Diffstat (limited to 'fs/btrfs/extent_io.c')
 fs/btrfs/extent_io.c | 656
 1 file changed, 399 insertions(+), 257 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 60f5f68d892d..6e3b72e63e42 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -142,7 +142,7 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
@@ -530,7 +530,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits,
+ struct extent_state *state, u32 *bits,
struct extent_changeset *changeset);
/*
@@ -547,7 +547,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -628,11 +628,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake,
+ u32 *bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -695,9 +695,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+ u32 bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -868,7 +868,7 @@ static void wait_on_state(struct extent_io_tree *tree,
* The tree lock is taken by this function
*/
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits)
+ u32 bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -915,9 +915,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
- unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -961,12 +961,10 @@ static void cache_state(struct extent_state *state,
*
* [start, end] is inclusive. This takes the tree lock.
*/
-
-static int __must_check
-__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned exclusive_bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ u32 exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -980,6 +978,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
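For orientation, the two call shapes the new assertions allow look roughly like this. This is a hedged sketch, not lines from this patch; the EXTENT_DELALLOC caller and the local variable names are hypothetical:

	/* Non-exclusive set: no failed_start, plain bit setting */
	ret = set_extent_bit(tree, start, end, EXTENT_DELALLOC, 0, NULL,
			     &cached_state, GFP_NOFS, NULL);

	/* Exclusive set (locking): failed_start reports where the conflict starts */
	ret = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, &cached_state, GFP_NOFS, NULL);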
@@ -1179,15 +1181,6 @@ out:
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 * failed_start,
- struct extent_state **cached_state, gfp_t mask)
-{
- return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask, NULL);
-}
-
-
/**
* convert_extent_bit - convert all bits in a given range from one bit to
* another
@@ -1207,7 +1200,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
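As a usage sketch of the conversion (from memory of the transaction write-out path, not part of this hunk; the dirty_pages tree name is an assumption): a dirty range can be switched to "needs waiting" in one pass:

	ret = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
				 EXTENT_DIRTY, &cached_state);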
@@ -1408,7 +1401,7 @@ out:
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1418,19 +1411,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ changeset);
}
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits)
+ u32 bits)
{
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+ GFP_NOWAIT, NULL);
}
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
@@ -1438,7 +1431,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1461,9 +1454,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1479,8 +1472,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1526,8 +1519,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
* nothing was found after 'start'
*/
static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned bits)
+find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1554,14 +1546,15 @@ out:
}
/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
+ * Find the first offset in the io tree with one or more @bits set.
*
- * If nothing was found, 1 is returned. If found something, return 0.
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
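A minimal caller sketch of the convention documented above (illustrative only; the walker function is an assumption, not part of this patch):

static void walk_ranges_with_bit(struct extent_io_tree *tree, u32 bits)
{
	u64 cur = 0;
	u64 found_start;
	u64 found_end;

	/* 0 means a range was found, 1 means we hit the end of the tree */
	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
				      bits, NULL)) {
		/* [found_start, found_end] is inclusive and has @bits set */
		cur = found_end + 1;
	}
}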
@@ -1612,7 +1605,7 @@ out:
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
int ret = 1;
@@ -1649,7 +1642,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct rb_node *node, *prev = NULL, *next;
@@ -1946,7 +1939,7 @@ static int __process_pages_contig(struct address_space *mapping,
unsigned long page_ops, pgoff_t *index_ret)
{
unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_locked = 0;
+ unsigned long pages_processed = 0;
pgoff_t index = start_index;
struct page *pages[16];
unsigned ret;
@@ -1981,7 +1974,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
continue;
}
if (page_ops & PAGE_CLEAR_DIRTY)
@@ -2006,7 +1999,7 @@ static int __process_pages_contig(struct address_space *mapping,
}
}
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
}
nr_pages -= ret;
index += ret;
@@ -2014,14 +2007,13 @@ static int __process_pages_contig(struct address_space *mapping,
}
out:
if (err && index_ret)
- *index_ret = start_index + pages_locked - 1;
+ *index_ret = start_index + pages_processed - 1;
return err;
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+ u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
@@ -2037,7 +2029,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned bits, int contig)
+ u32 bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -2157,7 +2149,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled, struct extent_state *cached)
+ u32 bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2642,7 +2634,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
}
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook)
@@ -2652,7 +2644,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
- const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
@@ -2685,7 +2677,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_bio->bi_private = failed_bio->bi_private;
if (failed_io_bio->csum) {
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
repair_io_bio->csum = repair_io_bio->csum_inline;
memcpy(repair_io_bio->csum,
@@ -2775,16 +2767,88 @@ static void end_bio_extent_writepage(struct bio *bio)
bio_put(bio);
}
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * This allows endio_readpage_release_extent() to handle a full extent range,
+ * reducing the number of extent io tree operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to the processed extent.
+ *
+ * Will release the processed extent when @inode or @uptodate changes, or when
+ * the range is no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force the processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
{
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ struct extent_io_tree *tree;
+
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
- if (uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, start, end, &cached);
+ /*
+ * Contiguous to the processed extent, just update the end.
+ *
+ * Several things to notice:
+ *
+ * - bios can be merged as long as the on-disk bytenr is contiguous
+ * This means we can have pages belonging to other inodes, thus we need
+ * to check if the inode still matches.
+ * - a bvec can contain a range beyond the current page for multi-page bvecs
+ * Thus we need the processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * Now we don't have range contiguous to the processed range, release
+ * the processed range now.
+ */
+ if (processed->uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, processed->start, processed->end,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, processed->start, processed->end,
+ &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
+
+static void endio_readpage_update_page_status(struct page *page, bool uptodate)
+{
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
}
/*
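A worked example of the batching above (illustrative only, assuming 4K sectors of a single btrfs_inode completing in order): the first two calls merge into [0, 8191]; the third range is not contiguous, so [0, 8191] is released and [16384, 20479] becomes the new pending range; the final inode == NULL call releases whatever is left.

	struct processed_extent processed = { 0 };

	endio_readpage_release_extent(&processed, inode, 0, 4095, true);
	endio_readpage_release_extent(&processed, inode, 4096, 8191, true);
	endio_readpage_release_extent(&processed, inode, 16384, 20479, true);

	/* No more ranges: force the pending [16384, 20479] to be released */
	endio_readpage_release_extent(&processed, NULL, 0, 0, false);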
@@ -2804,12 +2868,12 @@ static void end_bio_extent_readpage(struct bio *bio)
int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
- u64 offset = 0;
- u64 start;
- u64 end;
- u64 len;
- u64 extent_start = 0;
- u64 extent_len = 0;
+ struct processed_extent processed = { 0 };
+ /*
+ * Offset of the current bvec from the beginning of the bio; since one
+ * bio can never be larger than UINT_MAX, u32 here is enough.
+ */
+ u32 bio_offset = 0;
int mirror;
int ret;
struct bvec_iter_all iter_all;
@@ -2819,42 +2883,48 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 start;
+ u64 end;
+ u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_status,
+ bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
-
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
len = bvec->bv_len;
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio, offset, page,
- start, end, mirror);
+ ret = btrfs_verify_data_csum(io_bio,
+ bio_offset, page, start, end,
+ mirror);
else
ret = btrfs_validate_metadata_buffer(io_bio,
- offset, page, start, end, mirror);
+ page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2879,12 +2949,14 @@ static void end_bio_extent_readpage(struct bio *bio)
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ if (!btrfs_submit_read_repair(inode, bio, bio_offset,
+ page,
start - page_offset(page),
start, end, mirror,
btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
- offset += len;
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
continue;
}
} else {
@@ -2908,40 +2980,17 @@ readpage_ok:
off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- offset += len;
-
- if (unlikely(!uptodate)) {
- if (extent_len) {
- endio_readpage_release_extent(tree,
- extent_start,
- extent_len, 1);
- extent_start = 0;
- extent_len = 0;
- }
- endio_readpage_release_extent(tree, start,
- end - start + 1, 0);
- } else if (!extent_len) {
- extent_start = start;
- extent_len = end + 1 - start;
- } else if (extent_start + extent_len == start) {
- extent_len += end + 1 - start;
- } else {
- endio_readpage_release_extent(tree, extent_start,
- extent_len, uptodate);
- extent_start = start;
- extent_len = end + 1 - start;
}
- }
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
- if (extent_len)
- endio_readpage_release_extent(tree, extent_start, extent_len,
- uptodate);
+ /* Update page status and unlock */
+ endio_readpage_update_page_status(page, uptodate);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, uptodate);
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -3038,7 +3087,7 @@ static int submit_extent_page(unsigned int opf,
{
int ret = 0;
struct bio *bio;
- size_t page_size = min_t(size_t, size, PAGE_SIZE);
+ size_t io_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -3054,12 +3103,12 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
+ if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags))
can_merge = false;
if (prev_bio_flags != bio_flags || !contig || !can_merge ||
force_bio_submit ||
- bio_add_page(bio, page, page_size, pg_offset) < page_size) {
+ bio_add_page(bio, page, io_size, pg_offset) < io_size) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -3068,13 +3117,13 @@ static int submit_extent_page(unsigned int opf,
bio = NULL;
} else {
if (wbc)
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
return 0;
}
}
bio = btrfs_bio_alloc(offset);
- bio_add_page(bio, page, page_size, pg_offset);
+ bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
@@ -3085,7 +3134,7 @@ static int submit_extent_page(unsigned int opf,
bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
}
*bio_ret = bio;
@@ -3096,6 +3145,15 @@ static int submit_extent_page(unsigned int opf,
static void attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page)
{
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
if (!PagePrivate(page))
attach_page_private(page, eb);
else
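The caller pattern the new assertion expects looks roughly like this (a sketch; the surrounding allocation and error handling are elided):

	spin_lock(&mapping->private_lock);
	/* ... check for an eb already attached by a racing allocator ... */
	attach_extent_buffer_page(eb, page);
	spin_unlock(&mapping->private_lock);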
@@ -3158,7 +3216,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -3224,13 +3281,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
- disk_io_size = em->block_len;
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED)
offset = em->block_start;
- } else {
+ else
offset = em->block_start + extent_offset;
- disk_io_size = iosize;
- }
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3319,7 +3373,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, offset, disk_io_size,
+ page, offset, iosize,
pg_offset, bio,
end_bio_extent_readpage, 0,
*bio_flags,
@@ -3656,11 +3710,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
}
/*
- * Lock eb pages and flush the bio if we can't the locks
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush the write bio if we can't get the lock.
*
- * Return 0 if nothing went wrong
- * Return >0 is same as 0, except bio is not submitted
- * Return <0 if something went wrong, no page is locked
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 if the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
@@ -3930,10 +3987,81 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page; if the current
+ * page belongs to this eb, we don't need to submit it again
+ *
+ * The caller should pass each page in bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ free_extent_buffer(eb);
+ return ret;
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
.bio = NULL,
.extent_locked = 0,
@@ -3979,55 +4107,13 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (!PagePrivate(page))
- continue;
-
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
continue;
- }
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON
- * but no sense in crashing the users box for something
- * we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- if (eb == prev_eb) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
- if (!ret)
- continue;
-
- prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, &epd);
- if (!ret) {
- free_extent_buffer(eb);
- continue;
- } else if (ret < 0) {
- done = 1;
- free_extent_buffer(eb);
- break;
- }
-
- ret = write_one_eb(eb, wbc, &epd);
- if (ret) {
+ if (ret < 0) {
done = 1;
- free_extent_buffer(eb);
break;
}
- free_extent_buffer(eb);
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4048,7 +4134,6 @@ retry:
index = 0;
goto retry;
}
- ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
return ret;
@@ -4382,14 +4467,22 @@ int extent_invalidatepage(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent_cached(tree, start, end, &cached_state);
return 0;
}
@@ -4409,12 +4502,14 @@ static int try_release_extent_state(struct extent_io_tree *tree,
ret = 0;
} else {
/*
- * at this point we can safely clear everything except the
- * locked bit and the nodatasum bit
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
*/
ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask, NULL);
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4691,7 +4786,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
@@ -4950,12 +5044,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
- rwlock_init(&eb->lock);
- atomic_set(&eb->blocking_readers, 0);
- eb->blocking_writers = 0;
- eb->lock_recursed = false;
- init_waitqueue_head(&eb->write_lock_wq);
- init_waitqueue_head(&eb->read_lock_wq);
+ init_rwsem(&eb->lock);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
@@ -4964,19 +5053,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- /*
- * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
- */
- BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
- > MAX_INLINE_EXTENT_BUFFER_SIZE);
- BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
-
-#ifdef CONFIG_BTRFS_DEBUG
- eb->spinning_writers = 0;
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->read_locks, 0);
- eb->write_locks = 0;
-#endif
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -5105,7 +5182,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_SHIFT);
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -5157,7 +5234,7 @@ again:
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5178,7 +5255,7 @@ free_eb:
#endif
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
@@ -5196,6 +5273,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ offset_in_page(start) + len > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %lu",
+ start, len);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -5203,6 +5288,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
+ btrfs_set_buffer_lockdep_class(owner_root, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
@@ -5231,13 +5317,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
exists = NULL;
- /*
- * Do this so attach doesn't complain and we need to
- * drop the ref the old guy had.
- */
- ClearPagePrivate(p);
WARN_ON(PageDirty(p));
- put_page(p);
+ detach_page_private(p);
}
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
@@ -5265,7 +5346,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5321,7 +5402,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
@@ -5622,12 +5703,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
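get_eb_page_index() and get_eb_offset_in_page() are introduced in the header rather than in this file. As a rough sketch of what they are expected to compute (an assumption based on the subpage design, where a tree block may start at a non-zero offset inside a page but never crosses a page boundary):

static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
					   unsigned long offset)
{
	/*
	 * For sectorsize == PAGE_SIZE, eb->start is page aligned and adding
	 * it changes nothing; for subpage, the eb's offset inside its page
	 * must be taken into account.
	 */
	return offset_in_page(offset + eb->start);
}

static inline unsigned long get_eb_page_index(unsigned long offset)
{
	/* Subpage tree blocks fit in one page, so this is always 0 there */
	return offset >> PAGE_SHIFT;
}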
@@ -5652,13 +5733,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5687,13 +5768,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5719,7 +5800,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5729,7 +5810,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5742,12 +5823,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5771,12 +5852,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5800,10 +5881,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
+ if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
}
void copy_extent_buffer(const struct extent_buffer *dst,
@@ -5816,7 +5907,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = dst_offset >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -5824,7 +5915,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = offset_in_page(dst_offset);
+ offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5868,7 +5959,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + byte_offset;
+ offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -6022,11 +6113,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
return;
while (len > 0) {
- dst_off_in_page = offset_in_page(dst_offset);
- src_off_in_page = offset_in_page(src_offset);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
- dst_i = dst_offset >> PAGE_SHIFT;
- src_i = src_offset >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6062,11 +6153,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
while (len > 0) {
- dst_i = dst_end >> PAGE_SHIFT;
- src_i = src_end >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
- dst_off_in_page = offset_in_page(dst_end);
- src_off_in_page = offset_in_page(src_end);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -6121,3 +6212,54 @@ int try_release_extent_buffer(struct page *page)
return release_extent_buffer(eb);
}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block; we simply read the bytenr pointed
+ * to by the slot in the node provided.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}