From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:04:42 -0400 Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers A buffer cache is allocated from movable area because it is referred for a while and released soon. But some filesystems are taking buffer cache for a long time and it can disturb page migration. New APIs are introduced to allocate buffer cache with user specific flag. *_gfp APIs are for user want to set page allocation flag for page cache allocation. And *_unmovable APIs are for the user wants to allocate page cache from non-movable area. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/buffer.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..9a6029e0dd71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1058,7 +1058,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1404,24 +1407,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. -- cgit v1.3-8-gc7d7 From f2d5a94436cc7cc0221b9a81bba2276a25187dd3 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Mon, 22 Sep 2014 01:53:03 +0100 Subject: Fix nasty 32-bit overflow bug in buffer i/o code. On 32-bit architectures, the legacy buffer_head functions are not always handling the sector number with the proper 64-bit types, and will thus fail on 4TB+ disks. Any code that uses __getblk() (and thus bread(), breadahead(), sb_bread(), sb_breadahead(), sb_getblk()), and calls it using a 64-bit block on a 32-bit arch (where "long" is 32-bit) causes an inifinite loop in __getblk_slow() with an infinite stream of errors logged to dmesg like this: __find_get_block_slow() failed. block=6740375944, b_blocknr=2445408648 b_state=0x00000020, b_size=512 device sda1 blocksize: 512 Note how in hex block is 0x191C1F988 and b_blocknr is 0x91C1F988 i.e. the top 32-bits are missing (in this case the 0x1 at the top). This is because grow_dev_page() is broken and has a 32-bit overflow due to shifting the page index value (a pgoff_t - which is just 32 bits on 32-bit architectures) left-shifted as the block number. But the top bits to get lost as the pgoff_t is not type cast to sector_t / 64-bit before the shift. This patch fixes this issue by type casting "index" to sector_t before doing the left shift. Note this is not a theoretical bug but has been seen in the field on a 4TiB hard drive with logical sector size 512 bytes. This patch has been verified to fix the infinite loop problem on 3.17-rc5 kernel using a 4TB disk image mounted using "-o loop". Without this patch doing a "find /nt" where /nt is an NTFS volume causes the inifinite loop 100% reproducibly whilst with the patch it works fine as expected. Signed-off-by: Anton Altaparmakov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..3588a80854b2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, bh = page_buffers(page); if (bh->b_size == size) { end_block = init_page_buffers(page, bdev, - index << sizebits, size); + (sector_t)index << sizebits, + size); goto done; } if (!try_to_free_buffers(page)) @@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, index << sizebits, size); + end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, + size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; -- cgit v1.3-8-gc7d7 From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:18 -0400 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data ->page_mkwrite() is used by filesystems to allocate blocks under a page which is becoming writeably mmapped in some process' address space. This allows a filesystem to return a page fault if there is not enough space available, user exceeds quota or similar problem happens, rather than silently discarding data later when writepage is called. However VFS fails to call ->page_mkwrite() in all the cases where filesystems need it when blocksize < pagesize. For example when blocksize = 1024, pagesize = 4096 the following is problematic: ftruncate(fd, 0); pwrite(fd, buf, 1024, 0); map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); map[0] = 'a'; ----> page_mkwrite() for index 0 is called ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ mremap(map, 1024, 10000, 0); map[4095] = 'a'; ----> no page_mkwrite() called At the moment ->page_mkwrite() is called, filesystem can allocate only one block for the page because i_size == 1024. Otherwise it would create blocks beyond i_size which is generally undesirable. But later at ->writepage() time, we also need to store data at offset 4095 but we don't have block allocated for it. This patch introduces a helper function filesystems can use to have ->page_mkwrite() called at all the necessary moments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/buffer.c | 3 +++ include/linux/mm.h | 1 + mm/truncate.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 9a6029e0dd71..6dc1475dcb2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock diff --git a/include/linux/mm.h b/include/linux/mm.h index 8981cc882ed2..5005464fe012 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include +#include #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,11 +720,67 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); +/** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit v1.3-8-gc7d7 From c2ca0fcd202863b14bd041a7fece2e789926c225 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 27 Jul 2014 13:00:41 -0400 Subject: fs: make cont_expand_zero interruptible This patch makes it possible to kill a process looping in cont_expand_zero. A process may spend a lot of time in this function, so it is desirable to be able to kill it. It happened to me that I wanted to copy a piece data from the disk to a file. By mistake, I used the "seek" parameter to dd instead of "skip". Due to the "seek" parameter, dd attempted to extend the file and became stuck doing so - the only possibility was to reset the machine or wait many hours until the filesystem runs out of space and cont_expand_zero fails. We need this patch to be able to terminate the process. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/buffer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 3588a80854b2..c8a1c01d6187 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2318,6 +2318,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, err = 0; balance_dirty_pages_ratelimited(mapping); + + if (unlikely(fatal_signal_pending(current))) { + err = -EINTR; + goto out; + } } /* page covers the boundary, find the boundary offset */ -- cgit v1.3-8-gc7d7 From 59d43914ed7b96255271ad6b7b735344beffa3c0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 9 Oct 2014 15:26:53 -0700 Subject: vfs: make guard_bh_eod() more generic This patchset implements readpages() operation for block device by using mpage_readpages() which can create multipage BIOs instead of BIOs for each page and reduce system CPU time consumption. This patch (of 3): guard_bh_eod() is used in submit_bh() to allow us to do IO even on the odd last sectors of a device, even if the block size is some multiple of the physical sector size. This makes guard_bh_eod() more generic and renames it guard_bio_eod() so that we can use it without struct buffer_head argument. The reason for this change is that using mpage_readpages() for block device requires to add this guard check in mpage code. Signed-off-by: Akinobu Mita Cc: Jens Axboe Cc: Alexander Viro Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 3588a80854b2..e442a26e80f7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2956,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) /* * This allows us to do IO even on the odd last sectors - * of a device, even if the bh block size is some multiple + * of a device, even if the block size is some multiple * of the physical sector size. * * We'll just truncate the bio to the size of the device, @@ -2966,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +static void guard_bio_eod(int rw, struct bio *bio) { sector_t maxsector; - unsigned bytes; + struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned truncated_bytes; maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (!maxsector) @@ -2984,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) return; maxsector -= bio->bi_iter.bi_sector; - bytes = bio->bi_iter.bi_size; - if (likely((bytes >> 9) <= maxsector)) + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bh that straddles the device size! */ - bytes = maxsector << 9; + /* Uhhuh. We've got a bio that straddles the device size! */ + truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); /* Truncate the bio.. */ - bio->bi_iter.bi_size = bytes; - bio->bi_io_vec[0].bv_len = bytes; + bio->bi_iter.bi_size -= truncated_bytes; + bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ if ((rw & RW_MASK) == READ) { - void *kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); - kunmap_atomic(kaddr); - flush_dcache_page(bh->b_page); + zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + truncated_bytes); } } @@ -3041,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ - guard_bh_eod(rw, bio, bh); + guard_bio_eod(rw, bio); if (buffer_meta(bh)) rw |= REQ_META; -- cgit v1.3-8-gc7d7 From 4db96b71e3caea5bb39053d57683129e0682c66f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 9 Oct 2014 15:26:55 -0700 Subject: vfs: guard end of device for mpage interface Add guard_bio_eod() check for mpage code in order to allow us to do IO even on the odd last sectors of a device, even if the block size is some multiple of the physical sector size. Using mpage_readpages() for block device requires this guard check. Signed-off-by: Akinobu Mita Cc: Jens Axboe Cc: Alexander Viro Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- fs/internal.h | 5 +++++ fs/mpage.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index e442a26e80f7..7bd5c4685e98 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2966,7 +2966,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -static void guard_bio_eod(int rw, struct bio *bio) +void guard_bio_eod(int rw, struct bio *bio) { sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; diff --git a/fs/internal.h b/fs/internal.h index e325b4f9c799..b2623200107b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -34,6 +34,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) } #endif +/* + * buffer.c + */ +extern void guard_bio_eod(int rw, struct bio *bio); + /* * char_dev.c */ diff --git a/fs/mpage.c b/fs/mpage.c index 5f9ed622274f..3e79220babac 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -28,6 +28,7 @@ #include #include #include +#include "internal.h" /* * I/O completion handler for multipage BIOs. @@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io; + guard_bio_eod(rw, bio); submit_bio(rw, bio); return NULL; } -- cgit v1.3-8-gc7d7 From 86cf78d73de8c6bfa89804b91ee0ace71a459961 Mon Sep 17 00:00:00 2001 From: Sebastien Buisson Date: Thu, 9 Oct 2014 15:29:38 -0700 Subject: fs/buffer.c: increase the buffer-head per-CPU LRU size Increase the buffer-head per-CPU LRU size to allow efficient filesystem operations that access many blocks for each transaction. For example, creating a file in a large ext4 directory with quota enabled will access multiple buffer heads and will overflow the LRU at the default 8-block LRU size: * parent directory inode table block (ctime, nlinks for subdirs) * new inode bitmap * inode table block * 2 quota blocks * directory leaf block (not reused, but pollutes one cache entry) * 2 levels htree blocks (only one is reused, other pollutes cache) * 2 levels indirect/index blocks (only one is reused) The buffer-head per-CPU LRU size is raised to 16, as it shows in metadata performance benchmarks up to 10% gain for create, 4% for lookup and 7% for destroy. Signed-off-by: Liang Zhen Signed-off-by: Andreas Dilger Signed-off-by: Sebastien Buisson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 7bd5c4685e98..44c14a87750e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1253,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) * a local interrupt disable for that. */ -#define BH_LRU_SIZE 8 +#define BH_LRU_SIZE 16 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; -- cgit v1.3-8-gc7d7 From 9470dd5d352985ba907df7554845f87a4b8f9ea5 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 13 Oct 2014 15:55:05 -0700 Subject: fs: check bh blocknr earlier when searching lru It's very common for the buffer heads in the lru to have different block numbers. By comparing the blocknr before the bdev and size we can reduce the cost of searching in the very common case where all the entries have the same bdev and size. In quick hot cache cycle counting tests on a single fs workstation this cut the cost of a miss by about 20%. A diff of the disassembly shows the reordering of the bdev and blocknr comparisons. This is in such a tiny loop that skipping one comparison is a meaningful portion of the total work being done: 1628: 83 c1 01 add $0x1,%ecx 162b: 83 f9 08 cmp $0x8,%ecx 162e: 74 60 je 1690 <__find_get_block+0xa0> 1630: 89 c8 mov %ecx,%eax 1632: 65 4c 8b 04 c5 00 00 mov %gs:0x0(,%rax,8),%r8 1639: 00 00 163b: 4d 85 c0 test %r8,%r8 163e: 4c 89 c3 mov %r8,%rbx 1641: 74 e5 je 1628 <__find_get_block+0x38> - 1643: 4d 3b 68 30 cmp 0x30(%r8),%r13 + 1643: 4d 3b 68 18 cmp 0x18(%r8),%r13 1647: 75 df jne 1628 <__find_get_block+0x38> - 1649: 4d 3b 60 18 cmp 0x18(%r8),%r12 + 1649: 4d 3b 60 30 cmp 0x30(%r8),%r12 164d: 75 d9 jne 1628 <__find_get_block+0x38> 164f: 49 39 50 20 cmp %rdx,0x20(%r8) 1653: 75 d3 jne 1628 <__find_get_block+0x38> Signed-off-by: Zach Brown Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index d1f704806264..9614adc7e754 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1331,8 +1331,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); - if (bh && bh->b_bdev == bdev && - bh->b_blocknr == block && bh->b_size == size) { + if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && + bh->b_size == size) { if (i) { while (i) { __this_cpu_write(bh_lrus.bhs[i], -- cgit v1.3-8-gc7d7 From b744c2ac4bbc040794efb33207d6ebc14f88ea2e Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:09 -0600 Subject: fs: merge I/O error prints into one line buffer.c uses two printk calls to print these messages: [67353.422338] Buffer I/O error on device sdr, logical block 212868488 [67353.422338] lost page write due to I/O error on sdr In a busy system, they may be interleaved with other prints, losing the context for the second message. Merge them into one line with one printk call so the prints are atomic. Also, differentiate between async page writes, sync page writes, and async page reads. Also, shorten "device" to "dev" to match the block layer prints: [67353.467906] blk_update_request: critical target error, dev sdr, sector 1707107328 Also, use %llu rather than %Lu. Resulting prints look like: [ 1356.437006] blk_update_request: critical target error, dev sdr, sector 1719693992 [ 1361.383522] quiet_error: 659876 callbacks suppressed [ 1361.385816] Buffer I/O error on dev sdr, logical block 256902912, lost async page write [ 1361.385819] Buffer I/O error on dev sdr, logical block 256903644, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 6c48f20eddd4..9d1da1d314a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -137,12 +137,12 @@ static int quiet_error(struct buffer_head *bh) } -static void buffer_io_error(struct buffer_head *bh) +static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + (unsigned long long)bh->b_blocknr, msg); } /* @@ -177,17 +177,11 @@ EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -305,7 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } else { clear_buffer_uptodate(bh); if (!quiet_error(bh)) - buffer_io_error(bh); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -353,7 +347,6 @@ still_busy: */ void end_buffer_async_write(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; @@ -365,12 +358,8 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.3-8-gc7d7 From 432f16e64f50fd4999a476543d04dd52f7a2d753 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:11 -0600 Subject: fs: clarify rate limit suppressed buffer I/O errors When quiet_error applies rate limiting to buffer_io_error calls, what the they apply to is unclear because the name is so generic, particularly if the messages are interleaved with others: [ 1936.063572] quiet_error: 664293 callbacks suppressed [ 1936.065297] Buffer I/O error on dev sdr, logical block 257429952, lost async page write [ 1936.067814] Buffer I/O error on dev sdr, logical block 257429953, lost async page write Also, the function uses printk_ratelimit(), although printk.h includes a comment advising "Please don't use... Instead use printk_ratelimited()." Change buffer_io_error to check the BH_Quiet bit itself, drop the printk_ratelimit call, and print using printk_ratelimited. This makes the messages look like: [ 387.208839] buffer_io_error: 676394 callbacks suppressed [ 387.210693] Buffer I/O error on dev sdr, logical block 211291776, lost async page write [ 387.213432] Buffer I/O error on dev sdr, logical block 211291777, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 9d1da1d314a2..20805db2c987 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -128,19 +128,13 @@ __clear_page_buffers(struct page *page) page_cache_release(page); } - -static int quiet_error(struct buffer_head *bh) -{ - if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) - return 0; - return 1; -} - - static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", + + if (!test_bit(BH_Quiet, &bh->b_state)) + printk_ratelimited(KERN_ERR + "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr, msg); } @@ -180,8 +174,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost sync page write"); + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -298,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); - if (!quiet_error(bh)) - buffer_io_error(bh, ", async page read"); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -358,8 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost async page write"); + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.3-8-gc7d7