From 23c560a99d78bddf5c251bfa97bce19e4da4b3f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 22:10:23 +0900 Subject: scatterlist: make sure sg_miter_next() doesn't return 0 sized mappings Impact: fix not-so-critical but annoying bug sg_miter_next() returns a zero sized mapping if there is a zero sized sg entry in the list or at the end of each iteration. As the users always check the ->length field, this bug shouldn't be critical other than causing unnecessary iteration. Fix it. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- lib/scatterlist.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b7b449dafbe5..a295e404e908 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -347,9 +347,12 @@ bool sg_miter_next(struct sg_mapping_iter *miter) sg_miter_stop(miter); /* get to the next sg if necessary. __offset is adjusted by stop */ - if (miter->__offset == miter->__sg->length && --miter->__nents) { - miter->__sg = sg_next(miter->__sg); - miter->__offset = 0; + while (miter->__offset == miter->__sg->length) { + if (--miter->__nents) { + miter->__sg = sg_next(miter->__sg); + miter->__offset = 0; + } else + return false; } /* map the next page */ -- cgit v1.2.3-59-g8ed1b From 25636e282fe95508cae96bb27f86407aef935817 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 22:10:24 +0900 Subject: block: fix SG_IO vector request data length handling Impact: fix SG_IO behavior such that it matches the documentation The SG_IO howto says that if ->dxfer_len and the sum of the iovec disagree, the shorter one wins. However, the current implementation returns -EINVAL for such cases. Trim the iovec if it's longer than ->dxfer_len. This patch uses iov_*() helpers which take struct iovec * by casting struct sg_iovec * to it. sg_iovec is always identical to iovec and this will be further cleaned up with later patches. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 84b7f8709f41..82a0ca2f6729 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -290,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->iovec_count) { const int size = sizeof(struct sg_iovec) * hdr->iovec_count; + size_t iov_data_len; struct sg_iovec *iov; iov = kmalloc(size, GFP_KERNEL); @@ -304,8 +305,18 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out; } + /* SG_IO howto says that the shorter of the two wins */ + iov_data_len = iov_length((struct iovec *)iov, + hdr->iovec_count); + if (hdr->dxfer_len < iov_data_len) { + hdr->iovec_count = iov_shorten((struct iovec *)iov, + hdr->iovec_count, + hdr->dxfer_len); + iov_data_len = hdr->dxfer_len; + } + ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, - hdr->dxfer_len, GFP_KERNEL); + iov_data_len, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, -- cgit v1.2.3-59-g8ed1b From cd0aca2d550f238d80ba58e7dcade4ea3d0a3aa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 22:10:25 +0900 Subject: block: fix queue bounce limit setting Impact: don't set GFP_DMA in q->bounce_gfp unnecessarily All DMA address limits are expressed in terms of the last addressable unit (byte or page) instead of one plus that.
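To see the off-by-one concretely, here is a minimal userspace sketch, not kernel code: PAGE_SHIFT is assumed to be 12 and the BLK_BOUNCE_HIGH clamping is omitted. A device with a full 32-bit mask can already reach every byte below 4G, yet the old test classifies it as needing GFP_DMA:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12 /* assumed, as on x86 */

    int main(void)
    {
        /* a device with a full 32-bit mask can reach every byte below 4G */
        uint64_t dma_mask = 0xffffffffULL;
        uint64_t b_pfn = dma_mask >> PAGE_SHIFT;

        /* buggy test: compares the last addressable pfn against a size */
        printf("old test sets GFP_DMA: %d\n",
               b_pfn < (0x100000000ULL >> PAGE_SHIFT)); /* prints 1 */

        /* fixed test: compares like against like, last pfn vs last pfn */
        printf("new test sets GFP_DMA: %d\n",
               b_pfn < (0xffffffffULL >> PAGE_SHIFT)); /* prints 0 */
        return 0;
    }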
However, when determining bounce_gfp for 64-bit machines in blk_queue_bounce_limit(), it compares the specified limit against 0x100000000UL to determine whether it's below 4G, ending up falsely setting GFP_DMA in q->bounce_gfp. As the DMA zone is very small on x86_64, this makes larger SG_IO transfers very eager to trigger the OOM killer. Fix it. While at it, rename the parameter to @dma_mask for clarity and convert the comment to proper winged style. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-settings.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 69c42adde52b..57af728d94bb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -156,26 +156,28 @@ EXPORT_SYMBOL(blk_queue_make_request); /** * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @dma_addr: bus address limit + * @q: the request queue for the device + * @dma_mask: the maximum address the device can handle * * Description: * Different hardware can have different requirements as to what pages * it can do I/O directly to. A low level driver can call * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @dma_addr. + * buffers for doing I/O to pages residing above @dma_mask. **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) +void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) { - unsigned long b_pfn = dma_addr >> PAGE_SHIFT; + unsigned long b_pfn = dma_mask >> PAGE_SHIFT; int dma = 0; q->bounce_gfp = GFP_NOIO; #if BITS_PER_LONG == 64 - /* Assume anything <= 4GB can be handled by IOMMU. Actually some IOMMUs can handle everything, but I don't know of a way to test this here. */ - if (b_pfn < (min_t(u64, 0x100000000UL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) + /* + * Assume anything <= 4GB can be handled by IOMMU. Actually + * some IOMMUs can handle everything, but I don't know of a + * way to test this here. + */ + if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; q->bounce_pfn = max_low_pfn; #else -- cgit v1.2.3-59-g8ed1b From 451a9ebf653d28337ba53ed5b4b70b0b9543cca1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 19:50:51 +0200 Subject: bio: fix bio_kmalloc() Impact: fix bio_kmalloc() and its destruction path bio_kmalloc() was broken in two ways. * bvec_alloc_bs() first allocates a bvec using kmalloc() and then ignores it and allocates again like non-kmalloc bvecs. * bio_kmalloc_destructor() didn't check for and free bio integrity data. This patch fixes the above problems. The kmalloc path is separated out from bio_alloc_bioset() and allocates the requested number of bvecs as inline bvecs. * bio_alloc_bioset() no longer takes NULL @bs. None other than bio_kmalloc() used it and outside users can't know how it was allocated anyway. * Define and use BIO_POOL_NONE so that the pool index check in bvec_free_bs() triggers if an inline or kmalloc allocated bvec gets there. * Relocate the destructors above each allocation function so that how they're used is clearer. Jens Axboe suggested allocating bvecs inline.
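The inline-bvec scheme is the classic trailing-array idiom: one allocation carries the header and its vector, so a single free releases both. A userspace sketch with made-up type names (hdr/vec stand in for bio/bio_vec; this is an illustration, not the kernel structs):

    #include <stdlib.h>
    #include <string.h>

    struct vec { void *page; unsigned int len, offset; };

    struct hdr {
        unsigned short max_vecs;
        struct vec *vecs;         /* points into inline_vecs below */
        struct vec inline_vecs[]; /* flexible array member */
    };

    /* one malloc covers the header and all nr vecs, so one free releases
     * both, mirroring bio_kmalloc()/bio_kmalloc_destructor()'s kmalloc/kfree */
    static struct hdr *alloc_with_inline_vecs(int nr)
    {
        struct hdr *h = malloc(sizeof(*h) + nr * sizeof(struct vec));

        if (!h)
            return NULL;
        memset(h, 0, sizeof(*h));
        h->max_vecs = nr;
        h->vecs = h->inline_vecs;
        return h;
    }

    int main(void)
    {
        struct hdr *h = alloc_with_inline_vecs(4);

        if (h) {
            h->vecs[3].len = 512; /* use the 4th inline vec */
            free(h); /* single free, no separate vec array to release */
        }
        return 0;
    }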
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 118 ++++++++++++++++++++++++---------------------------- include/linux/bio.h | 1 + 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index cd42bb882f30..d35588fd6d57 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -174,14 +174,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, { struct bio_vec *bvl; - /* - * If 'bs' is given, lookup the pool and do the mempool alloc. - * If not, this is a bio_kmalloc() allocation and just do a - * kzalloc() for the exact number of vecs right away. - */ - if (!bs) - bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); - /* * see comment near bvec_array define! */ @@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -/* - * default destructor for a bio allocated with bio_alloc_bioset() - */ -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_has_allocated_vec(bio)) - kfree(bio->bi_io_vec); - kfree(bio); -} - void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -301,21 +278,15 @@ void bio_init(struct bio *bio) **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; - struct bio *bio = NULL; - unsigned long idx = 0; - void *p = NULL; - - if (bs) { - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p) - goto err; - bio = p + bs->front_pad; - } else { - bio = kmalloc(sizeof(*bio), gfp_mask); - if (!bio) - goto err; - } + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; bio_init(bio); @@ -332,22 +303,50 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) nr_iovecs = bvec_nr_vecs(idx); } +out_set: bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; -out_set: bio->bi_io_vec = bvl; - return bio; err_free: - if (bs) - mempool_free(p, bs->bio_pool); - else - kfree(bio); -err: + mempool_free(p, bs->bio_pool); return NULL; } +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask + * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio); + kfree(bio); +} + /** * bio_alloc - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -366,29 +365,20 @@ err: * do so can cause livelocks under memory pressure. * **/ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) -{ - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; -} - -/* - * Like bio_alloc(), but doesn't use a mempool backing. 
This means that - * it CAN fail, but while bio_alloc() can only be used for allocations - * that have a short (finite) life span, bio_kmalloc() should be used - * for more permanent bio allocations (like allocating some bio's for - * initalization or setup purposes). - */ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + struct bio *bio; - if (bio) - bio->bi_destructor = bio_kmalloc_destructor; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; return bio; } diff --git a/include/linux/bio.h b/include/linux/bio.h index b89cf2d82898..7b214fd672a2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -132,6 +132,7 @@ struct bio { * top 4 bits of bio flags indicate the pool this bio came from */ #define BIO_POOL_BITS (4) +#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) -- cgit v1.2.3-59-g8ed1b From a9e9dc24bbc3e084450a22cf4fb82f5f5d4cbeea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 22:10:27 +0900 Subject: bio: use bio_kmalloc() in copy/map functions Impact: remove possible deadlock condition There is no reason to use mempool backed allocation for map functions. Also, because kern mapping is used inside LLDs (e.g. for EH), using mempool backed allocation can lead to deadlock under extreme conditions (mempool already consumed by the time a request reaches EH and requests are blocked on EH). Switch copy/map functions to bio_kmalloc(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index d35588fd6d57..7bbc98f0eda1 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -822,7 +822,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; @@ -946,7 +946,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); @@ -1130,7 +1130,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-59-g8ed1b From 71982a409f12c50d011325a4471aa20666bb908d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Apr 2009 08:34:48 +0200 Subject: block: include empty disks in /proc/diskstats /proc/diskstats used to show stats for all disks, whether zero-sized or not, and for their non-zero partitions. Commit 074a7aca7afa6f230104e8e65eba3420263714a5 accidentally changed the behavior such that it doesn't print out zero sized disks. This patch adds the DISK_PITER_INCL_EMPTY_PART0 flag to the partition iterator and uses it in diskstats_show() such that empty part0 is shown in /proc/diskstats. Reported and bisected by Daniel Collins.
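For reference, the skip condition the iterator ends up with composes the three flags as in this stand-alone userspace rendering (toy types; the flag values match genhd.h, the helper name is invented):

    #include <stdbool.h>
    #include <stdio.h>

    #define DISK_PITER_INCL_EMPTY       (1 << 1)
    #define DISK_PITER_INCL_PART0       (1 << 2)
    #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3)

    /* same shape as the test in disk_part_iter_next() */
    static bool skip(unsigned int flags, int idx, unsigned long long nr_sects)
    {
        return !nr_sects &&
               !(flags & DISK_PITER_INCL_EMPTY) &&
               !(flags & DISK_PITER_INCL_EMPTY_PART0 && idx == 0);
    }

    int main(void)
    {
        /* empty part0 is now reported... */
        printf("%d\n", skip(DISK_PITER_INCL_EMPTY_PART0, 0, 0)); /* 0: kept */
        /* ...while other empty partitions are still skipped */
        printf("%d\n", skip(DISK_PITER_INCL_EMPTY_PART0, 3, 0)); /* 1: skipped */
        return 0;
    }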
Signed-off-by: Tejun Heo Reported-by: Daniel Collins Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++++---- include/linux/genhd.h | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index a9ec910974c1..1a4916e01732 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -98,7 +98,7 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, if (flags & DISK_PITER_REVERSE) piter->idx = ptbl->len - 1; - else if (flags & DISK_PITER_INCL_PART0) + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; @@ -134,7 +134,8 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* determine iteration parameters */ if (piter->flags & DISK_PITER_REVERSE) { inc = -1; - if (piter->flags & DISK_PITER_INCL_PART0) + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) end = -1; else end = 0; @@ -150,7 +151,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) + if (!part->nr_sects && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) continue; get_device(part_to_dev(part)); @@ -1011,7 +1015,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); part_round_stats(cpu, hd); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 634c53028fb8..a1a28caed23d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -214,6 +214,7 @@ static inline void disk_put_part(struct hd_struct *part) #define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ +#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ struct disk_part_iter { struct gendisk *disk; -- cgit v1.2.3-59-g8ed1b From b759113499d6c7cb75fab04f56772579308bc0f8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2009 08:36:50 +0200 Subject: block: make blk_abort_queue() ignore non-request based devices There's nothing to do for those devices, since the timeout handling is based on requests. Signed-off-by: Jens Axboe --- block/blk-timeout.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index bbbdc4b8ccf2..8f570c4c80ee 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -211,6 +211,12 @@ void blk_abort_queue(struct request_queue *q) struct request *rq, *tmp; LIST_HEAD(list); + /* + * Not a request based block device, nothing to abort + */ + if (!q->request_fn) + return; + spin_lock_irqsave(q->queue_lock, flags); elv_abort_queue(q); -- cgit v1.2.3-59-g8ed1b From 4d00aa47e2337dcfe2d8a7215dbde3765b507167 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 21 Apr 2009 07:25:04 +0200 Subject: cfq-iosched: make seek_mean converge more quickly Right now, depending on the first sector to which a process issues I/O, the seek time may start out way out of whack. So make sure we start with 0 sectors in seek, instead of the offset of the first request issued. 
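A toy decaying average shows how badly one absolute-offset sample poisons the mean; the weights below are illustrative and only loosely modeled on CFQ's fixed-point arithmetic, not the kernel's exact code:

    #include <stdio.h>

    struct seek { double samples, total; };

    static void update(struct seek *s, double sdist)
    {
        /* 7/8 decay toward the new sample; constants are illustrative */
        s->samples = (7 * s->samples + 8) / 8;
        s->total = (7 * s->total + 8 * sdist) / 8;
    }

    static double mean(const struct seek *s)
    {
        return s->samples ? s->total / s->samples : 0;
    }

    int main(void)
    {
        struct seek skewed = { 0 }, fixed = { 0 };
        int i;

        update(&skewed, 1000000); /* first "seek" = offset of request one */
        update(&fixed, 0);        /* with the fix, the first distance is 0 */
        for (i = 0; i < 8; i++) {
            update(&skewed, 16); /* then a nearly sequential workload */
            update(&fixed, 16);
        }
        /* the skewed mean stays thousands of sectors high for a long time */
        printf("skewed %.0f vs fixed %.0f\n", mean(&skewed), mean(&fixed));
        return 0;
    }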
Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0d3b70de3d80..0eb4aff9df68 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1908,7 +1908,9 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, sector_t sdist; u64 total; - if (cic->last_request_pos < rq->sector) + if (!cic->last_request_pos) + sdist = 0; + else if (cic->last_request_pos < rq->sector) sdist = rq->sector - cic->last_request_pos; else sdist = cic->last_request_pos - rq->sector; -- cgit v1.2.3-59-g8ed1b From 04dc6e71a28d4815bf9431efcafc107bb0ad2792 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 21 Apr 2009 07:31:56 +0200 Subject: cfq-iosched: use the default seek distance when there aren't enough seek samples If the cfq io context doesn't have enough samples yet to provide a mean seek distance, then use the default threshold we have for seeky IO instead of defaulting to 0. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0eb4aff9df68..7e13f04b5ed4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -947,14 +947,18 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - rq->sector; } +#define CIC_SEEK_THR 8 * 1024 +#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) + static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) { struct cfq_io_context *cic = cfqd->active_cic; + sector_t sdist = cic->seek_mean; if (!sample_valid(cic->seek_samples)) - return 0; + sdist = CIC_SEEK_THR; - return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; + return cfq_dist_from_last(cfqd, rq) <= sdist; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1039,9 +1043,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, return cfqq; } - -#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) - static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; -- cgit v1.2.3-59-g8ed1b From 097102c2d04974bdfcfa16a5f3062d499842139c Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Tue, 21 Apr 2009 09:33:14 +0200 Subject: pktcdvd.h should include mempool.h Fix this build error: In file included from fs/compat_ioctl.c:104: include/linux/pktcdvd.h:285: error: expected specifier-qualifier-list before 'mempool_t' Signed-off-by: Alexander Beregalov Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 04b4d7330e6d..d745f5b6c7b0 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -113,6 +113,7 @@ struct pkt_ctrl_command { #include <linux/blkdev.h> #include <linux/completion.h> #include <linux/cdrom.h> +#include <linux/mempool.h> /* default bio write queue congestion marks */ #define PKT_WRITE_CONGESTION_ON 10000 -- cgit v1.2.3-59-g8ed1b From 42dad7647aec49b3ad20dd0cb832b232a6ae514f Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 22 Apr 2009 14:01:49 +0200 Subject: block: simplify I/O stat accounting This simplifies the I/O stat accounting switching code and separates it completely from the I/O scheduler switch code. Requests are accounted according to the state of their request queue at the time of the request allocation. There is no longer any need to flush the request queue when switching I/O accounting state.
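In other words, the accounting decision is snapshotted into the request's flags when the request is allocated, and completion consults only that snapshot. A minimal userspace model of the pattern (toy structs, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define REQ_IO_STAT (1u << 0)

    struct queue { bool iostat; };
    struct request { unsigned int cmd_flags; };

    /* capture the queue's accounting state once, at allocation time */
    static void alloc_request(const struct queue *q, struct request *rq)
    {
        rq->cmd_flags = q->iostat ? REQ_IO_STAT : 0;
    }

    /* completion looks only at the request, so toggling the queue's
     * sysfs knob mid-flight can't confuse in-flight requests */
    static bool do_io_stat(const struct request *rq)
    {
        return rq->cmd_flags & REQ_IO_STAT;
    }

    int main(void)
    {
        struct queue q = { .iostat = true };
        struct request rq;

        alloc_request(&q, &rq);
        q.iostat = false; /* state flipped while rq is in flight */
        printf("accounted: %d\n", do_io_stat(&rq)); /* still 1 */
        return 0;
    }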
Signed-off-by: Jerome Marchand Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++-- block/blk-merge.c | 5 ++++- block/blk-sysfs.c | 4 ---- block/blk.h | 7 +------ include/linux/blkdev.h | 3 +++ 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1a..2998fe3a2377 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -643,7 +643,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) } static struct request * -blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -652,7 +652,7 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) blk_rq_init(q, rq); - rq->cmd_flags = rw | REQ_ALLOCED; + rq->cmd_flags = flags | REQ_ALLOCED; if (priv) { if (unlikely(elv_set_request(q, rq, gfp_mask))) { @@ -792,6 +792,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, if (priv) rl->elvpriv++; + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); diff --git a/block/blk-merge.c b/block/blk-merge.c index 63760ca3da0f..23d2a6fe34a3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -402,7 +402,10 @@ static int attempt_merge(struct request_queue *q, struct request *req, elv_merge_requests(q, req, next); - blk_account_io_merge(req); + /* + * 'next' is going away, so update stats accordingly + */ + blk_account_io_merge(next); req->ioprio = ioprio_best(req->ioprio, next->ioprio); if (blk_rq_cpu_valid(next)) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cac4e9febe6a..3ff9bba3379a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -209,14 +209,10 @@ static ssize_t queue_iostats_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&stats, page, count); spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - if (stats) queue_flag_set(QUEUE_FLAG_IO_STAT, q); else queue_flag_clear(QUEUE_FLAG_IO_STAT, q); - - elv_quiesce_end(q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/blk.h b/block/blk.h index 5dfc41267a08..79c85f7c9ff5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -114,12 +114,7 @@ static inline int blk_cpu_to_group(int cpu) static inline int blk_do_io_stat(struct request *rq) { - struct gendisk *disk = rq->rq_disk; - - if (!disk || !disk->queue) - return 0; - - return blk_queue_io_stat(disk->queue) && (rq->cmd_flags & REQ_ELVPRIV); + return rq->rq_disk && blk_rq_io_stat(rq); } #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba54c834a590..2755d5c6da22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -118,6 +118,7 @@ enum rq_flag_bits { __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NOIDLE, /* Don't anticipate more IO after this one */ + __REQ_IO_STAT, /* account I/O stat */ __REQ_NR_BITS, /* stops here */ }; @@ -145,6 +146,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_NOIDLE (1 << __REQ_NOIDLE) +#define REQ_IO_STAT (1 << __REQ_IO_STAT) #define BLK_MAX_CDB 16 @@ -598,6 +600,7 @@ enum { blk_failfast_transport(rq) || \ blk_failfast_driver(rq)) #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) +#define blk_rq_io_stat(rq) ((rq)->cmd_flags & 
REQ_IO_STAT) #define blk_account_rq(rq) (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) -- cgit v1.2.3-59-g8ed1b From f3c737de8f57b5ce756010c2175f7d574194b30d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 23 Apr 2009 08:37:58 +0200 Subject: umem: fix request_queue lock warning The umem driver issues two warnings on boot, due to blk_plug_device() and blk_remove_plug() being called without q->queue_lock held. Starting with e48ec690 (block: extend queue_flag bitops), the queue_flag_* functions warn if q->queue_lock doesn't appear to be locked. In fact, q->queue_lock is NULL (though that apparently isn't otherwise a problem as the driver is using card->lock for everything). Although blk_init_queue() will take a request_fn_proc and a spinlock_t *, there isn't a corresponding init helper that takes a make_request_fn. Setting queue_lock to &card->lock explicitly seems to work fine for me. The warning goes away and the device appears to behave. [ 1.531881] v2.3 : Micro Memory(tm) PCI memory board block driver [ 1.538136] umem 0000:02:01.0: PCI INT A -> GSI 20 (level, low) -> IRQ 20 [ 1.545018] umem 0000:02:01.0: Micro Memory(tm) controller found (PCI Mem Module (Battery Backup)) [ 1.554176] umem 0000:02:01.0: CSR 0xfc9ffc00 -> 0xffffc200013d0c00 (0x100) [ 1.561279] umem 0000:02:01.0: Size 1048576 KB, Battery 1 Disabled (FAILURE), Battery 2 Disabled (FAILURE) [ 1.571114] umem 0000:02:01.0: Window size 16777216 bytes, IRQ 20 [ 1.577304] umem 0000:02:01.0: memory NOT initialized. Consider over-writing whole device. [ 1.585989] umema:<4>------------[ cut here ]------------ [ 1.591775] WARNING: at include/linux/blkdev.h:492 blk_plug_device+0x6d/0x106() [ 1.592025] Hardware name: H8SSL [ 1.592025] Modules linked in: [ 1.592025] Pid: 1, comm: swapper Not tainted 2.6.29 #8 [ 1.592025] Call Trace: [ 1.592025] [] warn_slowpath+0xd3/0xf2 [ 1.592025] [] ? save_trace+0x3f/0x9b [ 1.592025] [] ? add_lock_to_list+0x7a/0xba [ 1.592025] [] ? validate_chain+0xb3b/0xce8 [ 1.592025] [] ? mm_make_request+0x27/0x59 [ 1.592025] [] ? mm_make_request+0x27/0x59 [ 1.592025] [] ? __lock_acquire+0x74e/0x7b9 [ 1.592025] [] ? get_lock_stats+0x34/0x5e [ 1.592025] [] ? put_lock_stats+0xe/0x27 [ 1.592025] [] ? mm_make_request+0x27/0x59 [ 1.592025] [] blk_plug_device+0x6d/0x106 [ 1.592025] [] mm_make_request+0x46/0x59 [ 1.592025] [] generic_make_request+0x335/0x3cf [ 1.592025] [] ? mempool_alloc_slab+0x11/0x13 [ 1.592025] [] ? mempool_alloc+0x45/0x101 [ 1.592025] [] ? put_lock_stats+0xe/0x27 [ 1.592025] [] submit_bio+0x10a/0x119 [ 1.592025] [] submit_bh+0xe5/0x109 [ 1.592025] [] block_read_full_page+0x2aa/0x2cb [ 1.592025] [] ? blkdev_get_block+0x0/0x4c [ 1.592025] [] ? _spin_unlock_irq+0x36/0x51 [ 1.592025] [] ? __lru_cache_add+0x92/0xb2 [ 1.592025] [] blkdev_readpage+0x13/0x15 [ 1.592025] [] read_cache_page_async+0x90/0x134 [ 1.592025] [] ? blkdev_readpage+0x0/0x15 [ 1.592025] [] ? adfspart_check_ICS+0x0/0x16c [ 1.592025] [] read_cache_page+0xe/0x45 [ 1.592025] [] read_dev_sector+0x2e/0x93 [ 1.592025] [] adfspart_check_ICS+0x28/0x16c [ 1.592025] [] ? trace_hardirqs_on+0xd/0xf [ 1.592025] [] ? adfspart_check_ICS+0x0/0x16c [ 1.592025] [] rescan_partitions+0x168/0x2fb [ 1.592025] [] __blkdev_get+0x259/0x336 [ 1.592025] [] ? kobject_put+0x47/0x4b [ 1.592025] [] blkdev_get+0xb/0xd [ 1.592025] [] register_disk+0xc4/0x12b [ 1.592025] [] add_disk+0xc3/0x12d [ 1.592025] [] ? mm_init+0x0/0x1a5 [ 1.592025] [] mm_init+0x129/0x1a5 [ 1.592025] [] ? mm_init+0x0/0x1a5 [ 1.592025] [] _stext+0x56/0x130 [ 1.592025] [] ?
register_irq_proc+0xae/0xca [ 1.592025] [] ? proc_pid_lookup+0xb4/0x18b [ 1.592025] [] kernel_init+0x132/0x18b [ 1.592025] [] child_rip+0xa/0x20 [ 1.592025] [] ? restore_args+0x0/0x30 [ 1.592025] [] ? kernel_init+0x0/0x18b [ 1.592025] [] ? child_rip+0x0/0x20 [ 1.592025] ---[ end trace 7150b3b86da74e1e ]--- [ 1.889858] ------------[ cut here ]------------ [ 1.893848] WARNING: at include/linux/blkdev.h:492 blk_remove_plug+0x5f/0x91() [ 1.893848] Hardware name: H8SSL [ 1.893848] Modules linked in: [ 1.893848] Pid: 1, comm: swapper Tainted: G W 2.6.29 #8 [ 1.893848] Call Trace: [ 1.893848] [] warn_slowpath+0xd3/0xf2 [ 1.893848] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 1.893848] [] ? restore_args+0x0/0x30 [ 1.893848] [] ? __atomic_notifier_call_chain+0x0/0xb2 [ 1.893848] [] ? _spin_unlock_irq+0x31/0x51 [ 1.893848] [] ? _spin_unlock_irq+0x4d/0x51 [ 1.893848] [] ? mm_make_request+0x4e/0x59 [ 1.893848] [] ? get_lock_stats+0x34/0x5e [ 1.893848] [] ? put_lock_stats+0x25/0x27 [ 1.893848] [] ? mm_unplug_device+0x25/0x50 [ 1.893848] [] blk_remove_plug+0x5f/0x91 [ 1.893848] [] mm_unplug_device+0x30/0x50 [ 1.893848] [] blk_unplug+0x78/0x7d [ 1.893848] [] blk_backing_dev_unplug+0xd/0xf [ 1.893848] [] block_sync_page+0x4a/0x4c [ 1.893848] [] sync_page+0x44/0x4d [ 1.893848] [] __wait_on_bit_lock+0x42/0x8a [ 1.893848] [] ? sync_page+0x0/0x4d [ 1.893848] [] __lock_page+0x64/0x6b [ 1.893848] [] ? wake_bit_function+0x0/0x2a [ 1.893848] [] read_cache_page_async+0xd4/0x134 [ 1.893848] [] ? blkdev_readpage+0x0/0x15 [ 1.893848] [] ? adfspart_check_ICS+0x0/0x16c [ 1.893848] [] read_cache_page+0xe/0x45 [ 1.893848] [] read_dev_sector+0x2e/0x93 [ 1.893848] [] adfspart_check_ICS+0x28/0x16c [ 1.893848] [] ? trace_hardirqs_on+0xd/0xf [ 1.893848] [] ? adfspart_check_ICS+0x0/0x16c [ 1.893848] [] rescan_partitions+0x168/0x2fb [ 1.893848] [] __blkdev_get+0x259/0x336 [ 1.893848] [] ? kobject_put+0x47/0x4b [ 1.893848] [] blkdev_get+0xb/0xd [ 1.893848] [] register_disk+0xc4/0x12b [ 1.893848] [] add_disk+0xc3/0x12d [ 1.893848] [] ? mm_init+0x0/0x1a5 [ 1.893848] [] mm_init+0x129/0x1a5 [ 1.893848] [] ? mm_init+0x0/0x1a5 [ 1.893848] [] _stext+0x56/0x130 [ 1.893848] [] ? register_irq_proc+0xae/0xca [ 1.893848] [] ? proc_pid_lookup+0xb4/0x18b [ 1.893848] [] kernel_init+0x132/0x18b [ 1.893848] [] child_rip+0xa/0x20 [ 1.893848] [] ? restore_args+0x0/0x30 [ 1.893848] [] ? kernel_init+0x0/0x18b [ 1.893848] [] ? child_rip+0x0/0x20 [ 1.893848] ---[ end trace 7150b3b86da74e1f ]--- Signed-off-by: Sage Weil Signed-off-by: Jens Axboe --- drivers/block/umem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 9744d59a69f2..858c34dd032d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -906,6 +906,7 @@ static int __devinit mm_pci_probe(struct pci_dev *dev, goto failed_alloc; blk_queue_make_request(card->queue, mm_make_request); + card->queue->queue_lock = &card->lock; card->queue->queuedata = card; card->queue->unplug_fn = mm_unplug_device; -- cgit v1.2.3-59-g8ed1b From 17d5c8ca7572124c9623045f24b0c21d4aa2b47f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 23 Apr 2009 10:32:59 +0200 Subject: block: fix intermittent dm timeout based oops Very rarely under stress testing of dm, oopses are occurring as something tampers with an old stack frame. This has been traced back to blk_abort_queue() leaving a timeout_list pointing to the stack. The reason is that sometimes blk_abort_request() won't delete the timer (a small race window where the request is already marked as complete but the timer has not yet been removed).
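The repair that follows uses the splice-back idiom: drain the queue onto a local (on-stack) list head, process it, and put any survivors back before returning. A toy userspace version with a simple singly-linked list instead of the kernel's list.h:

    #include <stdio.h>

    struct req { struct req *next; };

    /* move the whole queue onto a caller-owned (possibly on-stack) head */
    static struct req *splice_init(struct req **q)
    {
        struct req *head = *q;

        *q = NULL;
        return head;
    }

    /* push leftovers back onto the real queue (order not preserved here) */
    static void splice_back(struct req **q, struct req *left)
    {
        while (left) {
            struct req *r = left;

            left = r->next;
            r->next = *q;
            *q = r;
        }
    }

    int main(void)
    {
        struct req b = { NULL }, a = { &b };
        struct req *queue = &a;
        struct req *local = splice_init(&queue); /* like the on-stack LIST_HEAD */

        /* ... abort handling runs here and may leave entries on 'local' ... */

        splice_back(&queue, local); /* nothing may keep pointing at the stack */
        printf("queue restored: %d\n", queue != NULL); /* prints 1 */
        return 0;
    }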
Fix this by splicing back from the usually empty list to q->timeout_list. Signed-off-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-timeout.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8f570c4c80ee..1ec0d503cacd 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -230,6 +230,13 @@ void blk_abort_queue(struct request_queue *q) list_for_each_entry_safe(rq, tmp, &list, timeout_list) blk_abort_request(rq); + /* + * Occasionally, blk_abort_request() will return without + * deleting the element from the list. Make sure we add those back + * instead of leaving them on the local stack list. + */ + list_splice(&list, &q->timeout_list); + spin_unlock_irqrestore(q->queue_lock, flags); } -- cgit v1.2.3-59-g8ed1b From 26a2ac009c2b07e1959c8864ca23486c1c485587 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Apr 2009 12:13:27 +0200 Subject: cfq-iosched: clear ->prio_trees[] on cfqd alloc Not strictly needed, but we should make it clear that we init the rbtree roots here. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7e13f04b5ed4..20a54b8e03e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2446,12 +2446,22 @@ static void cfq_exit_queue(struct elevator_queue *e) static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; + int i; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; cfqd->service_tree = CFQ_RB_ROOT; + + /* + * Not strictly needed (since RB_ROOT just clears the node and we + * zeroed cfqd on alloc), but better be safe in case someone decides + * to add magic to the rb code + */ + for (i = 0; i < CFQ_PRIO_LISTS; i++) + cfqd->prio_trees[i] = RB_ROOT; + INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; -- cgit v1.2.3-59-g8ed1b From 3ac6c9f8a66726745136e46f63600550c3eb6cec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Apr 2009 12:14:56 +0200 Subject: cfq-iosched: fix bug with aliased request and cooperation detection cfq_prio_tree_lookup() should return the direct match, yet it always returns zero. Fix that. cfq_prio_tree_add() assumes that we don't get a direct match, while it is very possible that we do. Using O_DIRECT, you can have different cfqq with matching requests, since you don't have the page cache to serialize things for you. Fix this bug by only adding the cfqq if there isn't an existing match.
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 20a54b8e03e1..b0b754a6882b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -584,12 +584,13 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, else break; p = n; + cfqq = NULL; } *ret_parent = parent; if (rb_link) *rb_link = p; - return NULL; + return cfqq; } static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -608,10 +609,10 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector, &parent, &p); - BUG_ON(__cfqq); - - rb_link_node(&cfqq->p_node, parent, p); - rb_insert_color(&cfqq->p_node, root); + if (!__cfqq) { + rb_link_node(&cfqq->p_node, parent, p); + rb_insert_color(&cfqq->p_node, root); + } } /* -- cgit v1.2.3-59-g8ed1b From f2d1f0ae7851be5ebd9613a80dac139270938809 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Apr 2009 12:19:38 +0200 Subject: cfq-iosched: cache prio_tree root in cfqq->p_root Currently we look it up from ->ioprio, but ->ioprio can change if either the process gets its IO priority changed explicitly, or if cfq decides to temporarily boost it. So if we are unlucky, we can end up attempting to remove a node from a different rbtree root than where it was added. Fix this by using ->org_ioprio as the prio_tree index, since that will only change for explicit IO priority settings (not for a boost). Additionally cache the rbtree root inside the cfqq, then we don't have to add code to reinsert the cfqq in the prio_tree if IO priority changes. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b0b754a6882b..a55a9bd75bd1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -154,6 +154,8 @@ struct cfq_queue { unsigned long rb_key; /* prio tree member */ struct rb_node p_node; + /* prio tree root we belong to, if any */ + struct rb_root *p_root; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ @@ -558,10 +560,10 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } static struct cfq_queue * -cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, - struct rb_node **ret_parent, struct rb_node ***rb_link) +cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) { - struct rb_root *root = &cfqd->prio_trees[ioprio]; struct rb_node **p, *parent; struct cfq_queue *cfqq = NULL; @@ -595,24 +597,27 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio]; struct rb_node **p, *parent; struct cfq_queue *__cfqq; - if (!RB_EMPTY_NODE(&cfqq->p_node)) - rb_erase_init(&cfqq->p_node, root); + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } if (cfq_class_idle(cfqq)) return; if (!cfqq->next_rq) return; - __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector, + cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector, &parent, 
&p); if (!__cfqq) { rb_link_node(&cfqq->p_node, parent, p); - rb_insert_color(&cfqq->p_node, root); - } + rb_insert_color(&cfqq->p_node, cfqq->p_root); + } else + cfqq->p_root = NULL; } /* @@ -657,8 +662,10 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (!RB_EMPTY_NODE(&cfqq->rb_node)) cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); - if (!RB_EMPTY_NODE(&cfqq->p_node)) - rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]); + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -965,7 +972,7 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, struct cfq_queue *cur_cfqq) { - struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio]; + struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; struct rb_node *parent, *node; struct cfq_queue *__cfqq; sector_t sector = cfqd->last_position; @@ -977,8 +984,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * First, if we find a request starting at the end of the last * request, choose it. */ - __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio, - sector, &parent, NULL); + __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); if (__cfqq) return __cfqq; -- cgit v1.2.3-59-g8ed1b
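The cfqq->p_root caching above is an instance of a general defensive pattern: remember the container a node was inserted into and remove it from that, instead of recomputing the container from a key that may have changed in the meantime. A sketch of the pattern with a singly-linked bucket standing in for the rbtree (toy code, invented names):

    #include <stdio.h>

    struct bucket { struct node *head; };
    struct node { struct node *next; struct bucket *parent; int key; };

    static void insert(struct bucket *b, struct node *n)
    {
        n->next = b->head;
        b->head = n;
        n->parent = b; /* cache where we live, like cfqq->p_root */
    }

    static void erase(struct node *n)
    {
        struct bucket *b = n->parent;
        struct node **pp;

        if (!b)
            return;
        for (pp = &b->head; *pp; pp = &(*pp)->next) {
            if (*pp == n) {
                *pp = n->next;
                break;
            }
        }
        n->parent = NULL;
    }

    int main(void)
    {
        struct bucket buckets[2] = { { NULL }, { NULL } };
        struct node n = { NULL, NULL, 3 };

        insert(&buckets[n.key % 2], &n);
        n.key = 4; /* key changes after insertion (like a boosted ioprio)... */
        erase(&n); /* ...but removal still hits the right bucket */
        printf("bucket1 empty: %d\n", buckets[1].head == NULL); /* prints 1 */
        return 0;
    }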