Diffstat (limited to 'fs/buffer.c')
-rw-r--r--  fs/buffer.c | 874
1 file changed, 252 insertions, 622 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index 46bc589b7a03..d9c6d1fbb6dd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,8 +52,8 @@ #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint hint, struct writeback_control *wbc); +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -79,26 +79,26 @@ void unlock_buffer(struct buffer_head *bh) EXPORT_SYMBOL(unlock_buffer); /* - * Returns if the page has dirty or writeback buffers. If all the buffers - * are unlocked and clean then the PageDirty information is stale. If - * any of the pages are locked, it is assumed they are locked for IO. + * Returns if the folio has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the folio_test_dirty information is stale. If + * any of the buffers are locked, it is assumed they are locked for IO. */ -void buffer_check_dirty_writeback(struct page *page, +void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct buffer_head *head, *bh; *dirty = false; *writeback = false; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) return; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) *writeback = true; - head = page_buffers(page); bh = head; do { if (buffer_locked(bh)) @@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and - * unlock the buffer. This is what ll_rw_block uses too. + * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { @@ -282,10 +282,10 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* - * If none of the buffers had errors and they are all - * uptodate then we can set the page uptodate. + * If all of the buffers are uptodate then we can set the page + * uptodate. */ - if (page_uptodate && !PageError(page)) + if (page_uptodate) SetPageUptodate(page); unlock_page(page); return; @@ -314,7 +314,7 @@ static void decrypt_bh(struct work_struct *work) } /* - * I/O completion handler for block_read_full_page() - pages + * I/O completion handler for block_read_full_folio() - pages * which come unlocked at the end of I/O. */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) @@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode) * all already-submitted IO to complete, but does not queue any new * writes to the disk. * - * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as - * you dirty the buffers, and then use osync_inode_buffers to wait for + * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer + * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. 
*/ @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); + write_dirty_buffer(bh, 0); put_bh(bh); } } @@ -613,17 +613,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. */ -int __set_page_dirty_buffers(struct page *page) +bool block_dirty_folio(struct address_space *mapping, struct folio *folio) { - int newly_dirty; - struct address_space *mapping = page_mapping(page); - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); + struct buffer_head *head; + bool newly_dirty; spin_lock(&mapping->private_lock); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + head = folio_buffers(folio); + if (head) { struct buffer_head *bh = head; do { @@ -635,21 +632,21 @@ int __set_page_dirty_buffers(struct page *page) * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); + folio_memcg_lock(folio); + newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __folio_mark_dirty(folio, mapping, 1); - unlock_page_memcg(page); + folio_memcg_unlock(folio); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(block_dirty_folio); /* * Write out and wait upon a list of buffers. @@ -958,7 +955,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, size); goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page_folio(page))) goto failed; } @@ -1063,8 +1060,8 @@ __getblk_slow(struct block_device *bdev, sector_t block, * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent - * block_read_full_page() against that page will discover all the uptodate - * buffers, will set the page uptodate and will perform no I/O. + * block_read_full_folio() against that folio will discover all the uptodate + * buffers, will set the folio uptodate and will perform no I/O. */ /** @@ -1177,7 +1174,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1235,16 +1232,18 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + bh_lru_lock(); + /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. 
*/ - if (lru_cache_disabled()) + if (lru_cache_disabled()) { + bh_lru_unlock(); return; - - bh_lru_lock(); + } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { @@ -1343,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); + bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); -void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, - gfp_t gfp) -{ - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); - if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); - brelse(bh); - } -} -EXPORT_SYMBOL(__breadahead_gfp); - /** * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from @@ -1465,58 +1453,53 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } /** - * block_invalidatepage - invalidate part or all of a buffer-backed page - * - * @page: the page which is affected + * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. + * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * - * block_invalidatepage() is called when all or part of the page has become + * block_invalidate_folio() is called when all or part of the folio has been * invalidated by a truncate operation. * - * block_invalidatepage() does not have to release all buffers, but it must + * block_invalidate_folio() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void block_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; - unsigned int curr_off = 0; - unsigned int stop = length + offset; + size_t curr_off = 0; + size_t stop = length + offset; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; + BUG_ON(!folio_test_locked(folio)); /* * Check for overflow */ - BUG_ON(stop > PAGE_SIZE || stop < length); + BUG_ON(stop > folio_size(folio) || stop < length); + + head = folio_buffers(folio); + if (!head) + return; - head = page_buffers(page); bh = head; do { - unsigned int next_off = curr_off + bh->b_size; + size_t next_off = curr_off + bh->b_size; next = bh->b_this_page; /* @@ -1535,21 +1518,21 @@ void block_invalidatepage(struct page *page, unsigned int offset, } while (bh != head); /* - * We release buffers only if the entire page is being invalidated. + * We release buffers only if the entire folio is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. 
*/ - if (length == PAGE_SIZE) - try_to_release_page(page, 0); + if (length == folio_size(folio)) + filemap_release_folio(folio, 0); out: return; } -EXPORT_SYMBOL(block_invalidatepage); +EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1606,7 +1589,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; int i, count; @@ -1614,24 +1597,24 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec); - while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { - count = pagevec_count(&pvec); + folio_batch_init(&fbatch); + while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { + count = folio_batch_count(&fbatch); for (i = 0; i < count; i++) { - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - if (!page_has_buffers(page)) + if (!folio_buffers(folio)) continue; /* - * We use page lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ - lock_page(page); - /* Recheck when the page is locked which pins bhs */ - if (!page_has_buffers(page)) + folio_lock(folio); + /* Recheck when the folio is locked which pins bhs */ + head = folio_buffers(folio); + if (!head) goto unlock_page; - head = page_buffers(page); bh = head; do { if (!buffer_mapped(bh) || (bh->b_blocknr < block)) @@ -1645,9 +1628,9 @@ next: bh = bh->b_this_page; } while (bh != head); unlock_page: - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); /* End of range already reached? */ if (index > end || !index) @@ -1718,18 +1701,18 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = wbc_to_write_flags(wbc); + blk_opf_t write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. 
*/ @@ -1806,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1820,7 +1802,7 @@ done: /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. + * write_dirty_buffer/submit_bh. A rare case. */ end_page_writeback(page); @@ -1861,8 +1843,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1969,34 +1950,34 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } } -int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, +int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(page, inode, 0); + head = create_page_buffers(&folio->page, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } @@ -2016,20 +1997,20 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, if (buffer_new(bh)) { clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, + folio_zero_segments(folio, to, block_end, block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; @@ -2037,7 +2018,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } } @@ -2050,14 +2031,15 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); return err; } int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page, pos, len, get_block, NULL); + return __block_write_begin_int(page_folio(page), pos, len, get_block, + NULL); } 
EXPORT_SYMBOL(__block_write_begin); @@ -2091,7 +2073,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, /* * If this is a partial write which happened to make all buffers - * uptodate then we can optimize away a bogus readpage() for + * uptodate then we can optimize away a bogus read_folio() for * the next read(). Here we 'discover' whether the page went * uptodate as a result of this (potentially partial) write. */ @@ -2107,13 +2089,13 @@ static int __block_commit_write(struct inode *inode, struct page *page, * The filesystem needs to handle block truncation upon failure. */ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - unsigned flags, struct page **pagep, get_block_t *get_block) + struct page **pagep, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int status; - page = grab_cache_page_write_begin(mapping, index, flags); + page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; @@ -2140,12 +2122,12 @@ int block_write_end(struct file *file, struct address_space *mapping, if (unlikely(copied < len)) { /* - * The buffers that were written will now be uptodate, so we - * don't have to worry about a readpage reading them and - * overwriting a partial write. However if we have encountered - * a short write and only partially written into a buffer, it - * will not be marked uptodate, so a readpage might come in and - * destroy our partial write. + * The buffers that were written will now be uptodate, so + * we don't have to worry about a read_folio reading them + * and overwriting a partial write. However if we have + * encountered a short write and only partially written + * into a buffer, it will not be marked uptodate, so a + * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a * non uptodate page as a zero-length write, and force the @@ -2205,29 +2187,27 @@ int generic_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(generic_write_end); /* - * block_is_partially_uptodate checks whether buffers within a page are + * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * - * Returns true if all buffers which correspond to a file portion - * we want to read are uptodate. + * Returns true if all buffers which correspond to the specified part + * of the folio are uptodate. 
*/ -int block_is_partially_uptodate(struct page *page, unsigned long from, - unsigned long count) +bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; - int ret = 1; - - if (!page_has_buffers(page)) - return 0; + bool ret = true; - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) + return false; blocksize = head->b_size; - to = min_t(unsigned, PAGE_SIZE - from, count); + to = min_t(unsigned, folio_size(folio) - from, count); to = from + to; - if (from < blocksize && to > PAGE_SIZE - blocksize) - return 0; + if (from < blocksize && to > folio_size(folio) - blocksize) + return false; bh = head; block_start = 0; @@ -2235,7 +2215,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { - ret = 0; + ret = false; break; } if (block_end >= to) @@ -2250,26 +2230,29 @@ int block_is_partially_uptodate(struct page *page, unsigned long from, EXPORT_SYMBOL(block_is_partially_uptodate); /* - * Generic "read page" function for block devices that have the normal + * Generic "read_folio" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. - * Reads the page asynchronously --- the unlock_buffer() and + * Reads the folio asynchronously --- the unlock_buffer() and * set/clear_buffer_uptodate() functions propagate buffer state into the - * page struct once IO has completed. + * folio once IO has completed. */ -int block_read_full_page(struct page *page, get_block_t *get_block) +int block_read_full_folio(struct folio *folio, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; + bool page_error = false; + + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = create_page_buffers(page, inode, 0); + head = create_page_buffers(&folio->page, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_SHIFT - bbits); + iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; @@ -2286,11 +2269,14 @@ int block_read_full_page(struct page *page, get_block_t *get_block) if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); - if (err) - SetPageError(page); + if (err) { + folio_set_error(folio); + page_error = true; + } } if (!buffer_mapped(bh)) { - zero_user(page, i * blocksize, blocksize); + folio_zero_range(folio, i * blocksize, + blocksize); if (!err) set_buffer_uptodate(bh); continue; @@ -2306,16 +2292,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block) } while (i++, iblock++, (bh = bh->b_this_page) != head); if (fully_mapped) - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); if (!nr) { /* - * All buffers are uptodate - we can set the page uptodate + * All buffers are uptodate - we can set the folio uptodate * as well. But not if get_block() returned an error. 
*/ - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); + if (!page_error) + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } @@ -2336,11 +2322,11 @@ int block_read_full_page(struct page *page, get_block_t *get_block) if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } return 0; } -EXPORT_SYMBOL(block_read_full_page); +EXPORT_SYMBOL(block_read_full_folio); /* utility function for filesystems that need to do work on expanding * truncates. Uses filesystem pagecache writes to allow the filesystem to @@ -2349,20 +2335,20 @@ EXPORT_SYMBOL(block_read_full_page); int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); if (err) goto out; - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_CONT_EXPAND, &page, &fsdata); + err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata); if (err) goto out; - err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata); BUG_ON(err > 0); out: @@ -2374,9 +2360,10 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; + const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; @@ -2393,12 +2380,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, 0, + err = aops->write_begin(file, mapping, curpos, len, &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); - err = pagecache_write_end(file, mapping, curpos, len, len, + err = aops->write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) goto out; @@ -2426,12 +2413,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, 0, + err = aops->write_begin(file, mapping, curpos, len, &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); - err = pagecache_write_end(file, mapping, curpos, len, len, + err = aops->write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) goto out; @@ -2447,7 +2434,7 @@ out: * We may have to extend the file. */ int cont_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) { @@ -2466,7 +2453,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, flags, pagep, get_block); + return block_write_begin(mapping, pos, len, pagep, get_block); } EXPORT_SYMBOL(cont_write_begin); @@ -2535,341 +2522,6 @@ out_unlock: } EXPORT_SYMBOL(block_page_mkwrite); -/* - * nobh_write_begin()'s prereads are special: the buffer_heads are freed - * immediately, while under the page lock. So it needs a special end_io - * handler which does not touch the bh after unlocking it. 
- */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * Attach the singly-linked list of buffers created by nobh_write_begin, to - * the page (converting it to circular linked list and taking care of page - * dirty races). - */ -static void attach_nobh_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh; - - BUG_ON(!PageLocked(page)); - - spin_lock(&page->mapping->private_lock); - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (!bh->b_this_page) - bh->b_this_page = head; - bh = bh->b_this_page; - } while (bh != head); - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); -} - -/* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) - * The filesystem needs to handle block truncation upon failure. - */ -int nobh_write_begin(struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ - struct inode *inode = mapping->host; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; - struct buffer_head *head, *bh; - struct page *page; - pgoff_t index; - unsigned from, to; - unsigned block_in_page; - unsigned block_start, block_end; - sector_t block_in_file; - int nr_reads = 0; - int ret = 0; - int is_mapped_to_disk = 1; - - index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - *fsdata = NULL; - - if (page_has_buffers(page)) { - ret = __block_write_begin(page, pos, len, get_block); - if (unlikely(ret)) - goto out_release; - return ret; - } - - if (PageMappedToDisk(page)) - return 0; - - /* - * Allocate buffers so that we can keep track of state, and potentially - * attach them to the page if an error occurs. In the common case of - * no error, they will just be freed again without ever being attached - * to the page (which is all OK, because we're under the page lock). - * - * Be careful: the buffer linked list is a NULL terminated one, rather - * than the circular one we're used to. - */ - head = alloc_page_buffers(page, blocksize, false); - if (!head) { - ret = -ENOMEM; - goto out_release; - } - - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - - /* - * We loop across all blocks in the page, whether or not they are - * part of the affected region. This is so we can discover if the - * page is fully mapped-to-disk. 
- */ - for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_SIZE; - block_in_page++, block_start += blocksize, bh = bh->b_this_page) { - int create; - - block_end = block_start + blocksize; - bh->b_state = 0; - create = 1; - if (block_start >= to) - create = 0; - ret = get_block(inode, block_in_file + block_in_page, - bh, create); - if (ret) - goto failed; - if (!buffer_mapped(bh)) - is_mapped_to_disk = 0; - if (buffer_new(bh)) - clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (buffer_new(bh) || !buffer_mapped(bh)) { - zero_user_segments(page, block_start, from, - to, block_end); - continue; - } - if (buffer_uptodate(bh)) - continue; /* reiserfs does this */ - if (block_start < from || block_end > to) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(REQ_OP_READ, 0, bh); - nr_reads++; - } - } - - if (nr_reads) { - /* - * The page is locked, so these buffers are protected from - * any VM or truncate activity. Hence we don't need to care - * for the buffer_head refcounts. - */ - for (bh = head; bh; bh = bh->b_this_page) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - } - if (ret) - goto failed; - } - - if (is_mapped_to_disk) - SetPageMappedToDisk(page); - - *fsdata = head; /* to be released by nobh_write_end */ - - return 0; - -failed: - BUG_ON(!ret); - /* - * Error recovery is a bit difficult. We need to zero out blocks that - * were newly allocated, and dirty them to ensure they get written out. - * Buffers need to be attached to the page at this point, otherwise - * the handling of potential IO errors during writeout would be hard - * (could try doing synchronous writeout, but what if that fails too?) - */ - attach_nobh_buffers(page, head); - page_zero_new_buffers(page, from, to); - -out_release: - unlock_page(page); - put_page(page); - *pagep = NULL; - - return ret; -} -EXPORT_SYMBOL(nobh_write_begin); - -int nobh_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *head = fsdata; - struct buffer_head *bh; - BUG_ON(fsdata != NULL && page_has_buffers(page)); - - if (unlikely(copied < len) && head) - attach_nobh_buffers(page, head); - if (page_has_buffers(page)) - return generic_write_end(file, mapping, pos, len, - copied, page, fsdata); - - SetPageUptodate(page); - set_page_dirty(page); - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - unlock_page(page); - put_page(page); - - while (head) { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } - - return copied; -} -EXPORT_SYMBOL(nobh_write_end); - -/* - * nobh_writepage() - based on block_full_write_page() except - * that it tries to operate without attaching bufferheads to - * the page. - */ -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - int ret; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto out; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. 
It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); -out: - ret = mpage_writepage(page, get_block, wbc); - if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); - return ret; -} -EXPORT_SYMBOL(nobh_writepage); - -int nobh_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize; - sector_t iblock; - unsigned length, pos; - struct inode *inode = mapping->host; - struct page *page; - struct buffer_head map_bh; - int err; - - blocksize = i_blocksize(inode); - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); - - page = grab_cache_page(mapping, index); - err = -ENOMEM; - if (!page) - goto out; - - if (page_has_buffers(page)) { -has_buffers: - unlock_page(page); - put_page(page); - return block_truncate_page(mapping, from, get_block); - } - - /* Find the buffer that contains "offset" */ - pos = blocksize; - while (offset >= pos) { - iblock++; - pos += blocksize; - } - - map_bh.b_size = blocksize; - map_bh.b_state = 0; - err = get_block(inode, iblock, &map_bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(&map_bh)) - goto unlock; - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (!PageUptodate(page)) { - err = mapping->a_ops->readpage(NULL, page); - if (err) { - put_page(page); - goto out; - } - lock_page(page); - if (!PageUptodate(page)) { - err = -EIO; - goto unlock; - } - if (page_has_buffers(page)) - goto has_buffers; - } - zero_user(page, offset, length); - set_page_dirty(page); - err = 0; - -unlock: - unlock_page(page); - put_page(page); -out: - return err; -} -EXPORT_SYMBOL(nobh_truncate_page); - int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { @@ -2926,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping, set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { - err = -EIO; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); - wait_on_buffer(bh); + err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. 
*/ - if (!buffer_uptodate(bh)) + if (err < 0) goto unlock; } @@ -3006,9 +2656,10 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint write_hint, struct writeback_control *wbc) +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc) { + const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; BUG_ON(!buffer_locked(bh)); @@ -3023,13 +2674,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); - bio = bio_alloc(GFP_NOIO, 1); + if (buffer_meta(bh)) + opf |= REQ_META; + if (buffer_prio(bh)) + opf |= REQ_PRIO; + + bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); - bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3037,12 +2691,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - if (buffer_meta(bh)) - op_flags |= REQ_META; - if (buffer_prio(bh)) - op_flags |= REQ_PRIO; - bio_set_op_attrs(bio, op, op_flags); - /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); @@ -3052,71 +2700,15 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, } submit_bio(bio); - return 0; } -int submit_bh(int op, int op_flags, struct buffer_head *bh) +void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + submit_bh_wbc(opf, bh, NULL); } EXPORT_SYMBOL(submit_bh); -/** - * ll_rw_block: low-level access to block devices (DEPRECATED) - * @op: whether to %READ or %WRITE - * @op_flags: req_flag_bits - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @op_flags contains flags modifying the detailed I/O behavior, most notably - * %REQ_RAHEAD. - * - * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit), any buffer that appears to be clean when doing a write - * request, and any buffer that appears to be up-to-date when doing read - * request. Further it marks as clean buffers that are processed for - * writing (the buffer cache won't assume that they are actually clean - * until the buffer gets unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if appropriate), unlocks the buffer and wakes - * any waiters. - * - * All of the buffers must be for the same device, and must also be a - * multiple of the current approved size for the device. 
- */ -void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) -{ - int i; - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - if (!trylock_buffer(bh)) - continue; - if (op == WRITE) { - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(op, op_flags, bh); - continue; - } - } else { - if (!buffer_uptodate(bh)) { - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(op, op_flags, bh); - continue; - } - } - unlock_buffer(bh); - } -} -EXPORT_SYMBOL(ll_rw_block); - -void write_dirty_buffer(struct buffer_head *bh, int op_flags) +void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { @@ -3125,7 +2717,7 @@ void write_dirty_buffer(struct buffer_head *bh, int op_flags) } bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); @@ -3134,10 +2726,8 @@ EXPORT_SYMBOL(write_dirty_buffer); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ -int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) +int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { - int ret = 0; - WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { @@ -3152,14 +2742,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE, op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); - if (!ret && !buffer_uptodate(bh)) - ret = -EIO; + if (!buffer_uptodate(bh)) + return -EIO; } else { unlock_buffer(bh); } - return ret; + return 0; } EXPORT_SYMBOL(__sync_dirty_buffer); @@ -3170,21 +2760,21 @@ int sync_dirty_buffer(struct buffer_head *bh) EXPORT_SYMBOL(sync_dirty_buffer); /* - * try_to_free_buffers() checks if all the buffers on this particular page + * try_to_free_buffers() checks if all the buffers on this particular folio * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the page or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's private_lock. * - * If the page is dirty but all the buffers are clean then we need to - * be sure to mark the page clean as well. This is because the page + * If the folio is dirty but all the buffers are clean then we need to + * be sure to mark the folio clean as well. This is because the folio * may be against a block device, and a later reattachment of buffers - * to a dirty page will set *all* buffers dirty. Which would corrupt + * to a dirty folio will set *all* buffers dirty. Which would corrupt * filesystem data on the same device. * - * The same applies to regular filesystem pages: if all the buffers are - * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * The same applies to regular filesystem folios: if all the buffers are + * clean then we set the folio clean and proceed. To do that, we require + * total exclusion from block_dirty_folio(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. 
@@ -3195,10 +2785,10 @@ static inline int buffer_busy(struct buffer_head *bh) (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } -static int -drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +static bool +drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) { - struct buffer_head *head = page_buffers(page); + struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh; bh = head; @@ -3216,46 +2806,46 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - detach_page_private(page); - return 1; + folio_detach_private(folio); + return true; failed: - return 0; + return false; } -int try_to_free_buffers(struct page *page) +bool try_to_free_buffers(struct folio *folio) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; struct buffer_head *buffers_to_free = NULL; - int ret = 0; + bool ret = 0; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping == NULL) { /* can this still happen? */ - ret = drop_buffers(page, &buffers_to_free); + ret = drop_buffers(folio, &buffers_to_free); goto out; } spin_lock(&mapping->private_lock); - ret = drop_buffers(page, &buffers_to_free); + ret = drop_buffers(folio, &buffers_to_free); /* * If the filesystem writes its buffers by hand (eg ext3) - * then we can have clean buffers against a dirty page. We - * clean the page here; otherwise the VM will never notice + * then we can have clean buffers against a dirty folio. We + * clean the folio here; otherwise the VM will never notice * that the filesystem did any IO at all. * * Also, during truncate, discard_buffer will have marked all - * the page's buffers clean. We discover that here and clean - * the page also. + * the folio's buffers clean. We discover that here and clean + * the folio also. * * private_lock must be held over this entire operation in order - * to synchronise against __set_page_dirty_buffers and prevent the + * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) - cancel_dirty_page(page); + folio_cancel_dirty(folio); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { @@ -3364,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh) EXPORT_SYMBOL(bh_uptodate_or_lock); /** - * bh_submit_read - Submit a locked buffer for reading + * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @wait: wait until reading finish * - * Returns zero on success and -EIO on error. + * Returns zero on success or don't wait, and -EIO on error. 
*/ -int bh_submit_read(struct buffer_head *bh) +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - BUG_ON(!buffer_locked(bh)); + int ret = 0; - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return 0; - } + BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; + submit_bh(REQ_OP_READ | op_flags, bh); + if (wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + return ret; +} +EXPORT_SYMBOL(__bh_read); + +/** + * __bh_read_batch - Submit read for a batch of unlocked buffers + * @nr: entry number of the buffer batch + * @bhs: a batch of struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @force_lock: force to get a lock on the buffer if set, otherwise drops any + * buffer that cannot lock. + * + * Returns zero on success or don't wait, and -EIO on error. + */ +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (buffer_uptodate(bh)) + continue; + + if (force_lock) + lock_buffer(bh); + else + if (!trylock_buffer(bh)) + continue; + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ | op_flags, bh); + } } -EXPORT_SYMBOL(bh_submit_read); +EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { |
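
As a caller-side illustration of the ll_rw_block() removal above, here is a minimal sketch of the replacement patterns this diff switches to: bh_read() for a synchronous read, bh_readahead() for read-ahead, write_dirty_buffer() for writes, and a single blk_opf_t argument to submit_bh(). The helper example_read_block() is hypothetical, invented purely for illustration; it is not part of the patch, and only the bh_read()/bh_readahead() calls and their error handling mirror the hunks shown above.

#include <linux/buffer_head.h>

static int example_read_block(struct block_device *bdev, sector_t block,
			      unsigned int size)
{
	struct buffer_head *bh, *ra;
	int err;

	bh = __getblk(bdev, block, size);
	if (!bh)
		return -ENOMEM;

	/* Was: ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); */
	err = bh_read(bh, 0);
	if (err < 0) {
		brelse(bh);
		return err;
	}

	/* Was: ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &ra); */
	ra = __getblk(bdev, block + 1, size);
	if (ra) {
		bh_readahead(ra, REQ_RAHEAD);
		brelse(ra);
	}

	brelse(bh);
	return 0;
}

Note that bh_read() waits for completion and returns a negative errno on I/O error, while bh_readahead() and bh_read_nowait() return immediately; that is why the wait_on_buffer()/buffer_uptodate() sequence that used to follow ll_rw_block() collapses into a single return-value check in hunks such as block_truncate_page() above.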