diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 656 |
1 files changed, 399 insertions, 257 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60f5f68d892d..6e3b72e63e42 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -142,7 +142,7 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, unsigned bits, +static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, int set) { @@ -530,7 +530,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits, + struct extent_state *state, u32 *bits, struct extent_changeset *changeset); /* @@ -547,7 +547,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -628,11 +628,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake, + u32 *bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -695,9 +695,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + u32 bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -868,7 +868,7 @@ static void wait_on_state(struct extent_io_tree *tree, * The tree lock is taken by this function */ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits) + u32 bits) { struct extent_state *state; struct rb_node *node; @@ -915,9 +915,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { - unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -961,12 +961,10 @@ static void cache_state(struct extent_state *state, * * [start, end] is inclusive This takes the tree lock. */ - -static int __must_check -__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned exclusive_bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + u32 exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -980,6 +978,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1179,15 +1181,6 @@ out: } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 * failed_start, - struct extent_state **cached_state, gfp_t mask) -{ - return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask, NULL); -} - - /** * convert_extent_bit - convert all bits in a given range from one bit to * another @@ -1207,7 +1200,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, + u32 bits, u32 clear_bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1408,7 +1401,7 @@ out: /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1418,19 +1411,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, + changeset); } int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits) + u32 bits) { - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); } int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, @@ -1438,7 +1431,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1461,9 +1454,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1479,8 +1472,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1526,8 +1519,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) * nothing was found after 'start' */ static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned bits) +find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) { struct rb_node *node; struct extent_state *state; @@ -1554,14 +1546,15 @@ out: } /* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. + * Find the first offset in the io tree with one or more @bits set. * - * If nothing was found, 1 is returned. If found something, return 0. + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, + u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1612,7 +1605,7 @@ out: * returned will be the full contiguous area with the bits set. */ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; int ret = 1; @@ -1649,7 +1642,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, * trim @end_ret to the appropriate size. */ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct rb_node *node, *prev = NULL, *next; @@ -1946,7 +1939,7 @@ static int __process_pages_contig(struct address_space *mapping, unsigned long page_ops, pgoff_t *index_ret) { unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_locked = 0; + unsigned long pages_processed = 0; pgoff_t index = start_index; struct page *pages[16]; unsigned ret; @@ -1981,7 +1974,7 @@ static int __process_pages_contig(struct address_space *mapping, if (locked_page && pages[i] == locked_page) { put_page(pages[i]); - pages_locked++; + pages_processed++; continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -2006,7 +1999,7 @@ static int __process_pages_contig(struct address_space *mapping, } } put_page(pages[i]); - pages_locked++; + pages_processed++; } nr_pages -= ret; index += ret; @@ -2014,14 +2007,13 @@ static int __process_pages_contig(struct address_space *mapping, } out: if (err && index_ret) - *index_ret = start_index + pages_locked - 1; + *index_ret = start_index + pages_processed - 1; return err; } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -2037,7 +2029,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned bits, int contig) + u32 bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -2157,7 +2149,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, struct extent_state *cached) + u32 bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2642,7 +2634,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) } blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u64 phy_offset, + struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, u64 start, u64 end, int failed_mirror, submit_bio_hook_t *submit_bio_hook) @@ -2652,7 +2644,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); - const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits; + const int icsum = bio_offset >> fs_info->sectorsize_bits; bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; @@ -2685,7 +2677,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_bio->bi_private = failed_bio->bi_private; if (failed_io_bio->csum) { - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; repair_io_bio->csum = repair_io_bio->csum_inline; memcpy(repair_io_bio->csum, @@ -2775,16 +2767,88 @@ static void end_bio_extent_writepage(struct bio *bio) bio_put(bio); } -static void -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, - int uptodate) +/* + * Record previously processed extent range + * + * For endio_readpage_release_extent() to handle a full extent range, reducing + * the extent io operations. + */ +struct processed_extent { + struct btrfs_inode *inode; + /* Start of the range in @inode */ + u64 start; + /* End of the range in in @inode */ + u64 end; + bool uptodate; +}; + +/* + * Try to release processed extent range + * + * May not release the extent range right now if the current range is + * contiguous to processed extent. + * + * Will release processed extent when any of @inode, @uptodate, the range is + * no longer contiguous to the processed range. + * + * Passing @inode == NULL will force processed extent to be released. + */ +static void endio_readpage_release_extent(struct processed_extent *processed, + struct btrfs_inode *inode, u64 start, u64 end, + bool uptodate) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + struct extent_io_tree *tree; + + /* The first extent, initialize @processed */ + if (!processed->inode) + goto update; - if (uptodate && tree->track_uptodate) - set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, start, end, &cached); + /* + * Contiguous to processed extent, just uptodate the end. + * + * Several things to notice: + * + * - bio can be merged as long as on-disk bytenr is contiguous + * This means we can have page belonging to other inodes, thus need to + * check if the inode still matches. + * - bvec can contain range beyond current page for multi-page bvec + * Thus we need to do processed->end + 1 >= start check + */ + if (processed->inode == inode && processed->uptodate == uptodate && + processed->end + 1 >= start && end >= processed->end) { + processed->end = end; + return; + } + + tree = &processed->inode->io_tree; + /* + * Now we don't have range contiguous to the processed range, release + * the processed range now. + */ + if (processed->uptodate && tree->track_uptodate) + set_extent_uptodate(tree, processed->start, processed->end, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(tree, processed->start, processed->end, + &cached); + +update: + /* Update processed to current range */ + processed->inode = inode; + processed->start = start; + processed->end = end; + processed->uptodate = uptodate; +} + +static void endio_readpage_update_page_status(struct page *page, bool uptodate) +{ + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); } /* @@ -2804,12 +2868,12 @@ static void end_bio_extent_readpage(struct bio *bio) int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; - u64 offset = 0; - u64 start; - u64 end; - u64 len; - u64 extent_start = 0; - u64 extent_len = 0; + struct processed_extent processed = { 0 }; + /* + * The offset to the beginning of a bio, since one bio can never be + * larger than UINT_MAX, u32 here is enough. + */ + u32 bio_offset = 0; int mirror; int ret; struct bvec_iter_all iter_all; @@ -2819,42 +2883,48 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + u64 start; + u64 end; + u32 len; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_status, + bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } - - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + /* + * We always issue full-sector reads, but if some block in a + * page fails to read, blk_update_request() will advance + * bv_offset and adjust bv_len to compensate. Print a warning + * for unaligned offsets, and an error if they don't add up to + * a full sector. + */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, + sectorsize)) + btrfs_info(fs_info, + "incomplete page read with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; len = bvec->bv_len; mirror = io_bio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, offset, page, - start, end, mirror); + ret = btrfs_verify_data_csum(io_bio, + bio_offset, page, start, end, + mirror); else ret = btrfs_validate_metadata_buffer(io_bio, - offset, page, start, end, mirror); + page, start, end, mirror); if (ret) uptodate = 0; else @@ -2879,12 +2949,14 @@ static void end_bio_extent_readpage(struct bio *bio) * If it can't handle the error it will return -EIO and * we remain responsible for that page. */ - if (!btrfs_submit_read_repair(inode, bio, offset, page, + if (!btrfs_submit_read_repair(inode, bio, bio_offset, + page, start - page_offset(page), start, end, mirror, btrfs_submit_data_bio)) { uptodate = !bio->bi_status; - offset += len; + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; continue; } } else { @@ -2908,40 +2980,17 @@ readpage_ok: off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - offset += len; - - if (unlikely(!uptodate)) { - if (extent_len) { - endio_readpage_release_extent(tree, - extent_start, - extent_len, 1); - extent_start = 0; - extent_len = 0; - } - endio_readpage_release_extent(tree, start, - end - start + 1, 0); - } else if (!extent_len) { - extent_start = start; - extent_len = end + 1 - start; - } else if (extent_start + extent_len == start) { - extent_len += end + 1 - start; - } else { - endio_readpage_release_extent(tree, extent_start, - extent_len, uptodate); - extent_start = start; - extent_len = end + 1 - start; } - } + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; - if (extent_len) - endio_readpage_release_extent(tree, extent_start, extent_len, - uptodate); + /* Update page status and unlock */ + endio_readpage_update_page_status(page, uptodate); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, uptodate); + } + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -3038,7 +3087,7 @@ static int submit_extent_page(unsigned int opf, { int ret = 0; struct bio *bio; - size_t page_size = min_t(size_t, size, PAGE_SIZE); + size_t io_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -3054,12 +3103,12 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) + if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || force_bio_submit || - bio_add_page(bio, page, page_size, pg_offset) < page_size) { + bio_add_page(bio, page, io_size, pg_offset) < io_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -3068,13 +3117,13 @@ static int submit_extent_page(unsigned int opf, bio = NULL; } else { if (wbc) - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, io_size); return 0; } } bio = btrfs_bio_alloc(offset); - bio_add_page(bio, page, page_size, pg_offset); + bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -3085,7 +3134,7 @@ static int submit_extent_page(unsigned int opf, bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, io_size); } *bio_ret = bio; @@ -3096,6 +3145,15 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { + /* + * If the page is mapped to btree inode, we should hold the private + * lock to prevent race. + * For cloned or dummy extent buffers, their pages are not mapped and + * will not race with any other ebs. + */ + if (page->mapping) + lockdep_assert_held(&page->mapping->private_lock); + if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3158,7 +3216,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int nr = 0; size_t pg_offset = 0; size_t iosize; - size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -3224,13 +3281,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) { - disk_io_size = em->block_len; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) offset = em->block_start; - } else { + else offset = em->block_start + extent_offset; - disk_io_size = iosize; - } block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3319,7 +3373,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, disk_io_size, + page, offset, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3656,11 +3710,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) } /* - * Lock eb pages and flush the bio if we can't the locks + * Lock extent buffer status and pages for writeback. + * + * May try to flush write bio if we can't get the lock. * - * Return 0 if nothing went wrong - * Return >0 is same as 0, except bio is not submitted - * Return <0 if something went wrong, no page is locked + * Return 0 if the extent buffer doesn't need to be submitted. + * (E.g. the extent buffer is not dirty) + * Return >0 is the extent buffer is submitted to bio. + * Return <0 if something went wrong, no page is locked. */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) @@ -3930,10 +3987,81 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, return ret; } +/* + * Submit all page(s) of one extent buffer. + * + * @page: the page of one extent buffer + * @eb_context: to determine if we need to submit this page, if current page + * belongs to this eb, we don't need to submit + * + * The caller should pass each page in their bytenr order, and here we use + * @eb_context to determine if we have submitted pages of one extent buffer. + * + * If we have, we just skip until we hit a new page that doesn't belong to + * current @eb_context. + * + * If not, we submit all the page(s) of the extent buffer. + * + * Return >0 if we have submitted the extent buffer successfully. + * Return 0 if we don't need to submit the page, as it's already submitted by + * previous call. + * Return <0 for fatal error. + */ +static int submit_eb_page(struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + struct extent_buffer **eb_context) +{ + struct address_space *mapping = page->mapping; + struct extent_buffer *eb; + int ret; + + if (!PagePrivate(page)) + return 0; + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON but no point + * crashing the machine for something we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + if (eb == *eb_context) { + spin_unlock(&mapping->private_lock); + return 0; + } + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + return 0; + + *eb_context = eb; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret <= 0) { + free_extent_buffer(eb); + return ret; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + return ret; + return 1; +} + int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb, *prev_eb = NULL; + struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { .bio = NULL, .extent_locked = 0, @@ -3979,55 +4107,13 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (!PagePrivate(page)) - continue; - - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + ret = submit_eb_page(page, wbc, &epd, &eb_context); + if (ret == 0) continue; - } - - eb = (struct extent_buffer *)page->private; - - /* - * Shouldn't happen and normally this would be a BUG_ON - * but no sense in crashing the users box for something - * we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); - continue; - } - - if (eb == prev_eb) { - spin_unlock(&mapping->private_lock); - continue; - } - - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); - if (!ret) - continue; - - prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, &epd); - if (!ret) { - free_extent_buffer(eb); - continue; - } else if (ret < 0) { - done = 1; - free_extent_buffer(eb); - break; - } - - ret = write_one_eb(eb, wbc, &epd); - if (ret) { + if (ret < 0) { done = 1; - free_extent_buffer(eb); break; } - free_extent_buffer(eb); /* * the filesystem may choose to bump up nr_to_write. @@ -4048,7 +4134,6 @@ retry: index = 0; goto retry; } - ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); return ret; @@ -4382,14 +4467,22 @@ int extent_invalidatepage(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; + /* This function is only called for the btree inode */ + ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); + start += ALIGN(offset, blocksize); if (start > end) return 0; lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); + + /* + * Currently for btree io tree, only EXTENT_LOCKED is utilized, + * so here we only need to unlock the extent range to free any + * existing extent state. + */ + unlock_extent_cached(tree, start, end, &cached_state); return 0; } @@ -4409,12 +4502,14 @@ static int try_release_extent_state(struct extent_io_tree *tree, ret = 0; } else { /* - * at this point we can safely clear everything except the - * locked bit and the nodatasum bit + * At this point we can safely clear everything except the + * locked bit, the nodatasum bit and the delalloc new bit. + * The delalloc new bit will be cleared by ordered extent + * completion. */ ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask, NULL); + ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -4691,7 +4786,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); @@ -4950,12 +5044,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->len = len; eb->fs_info = fs_info; eb->bflags = 0; - rwlock_init(&eb->lock); - atomic_set(&eb->blocking_readers, 0); - eb->blocking_writers = 0; - eb->lock_recursed = false; - init_waitqueue_head(&eb->write_lock_wq); - init_waitqueue_head(&eb->read_lock_wq); + init_rwsem(&eb->lock); btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); @@ -4964,19 +5053,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - /* - * Sanity checks, currently the maximum is 64k covered by 16x 4k pages - */ - BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE - > MAX_INLINE_EXTENT_BUFFER_SIZE); - BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); - -#ifdef CONFIG_BTRFS_DEBUG - eb->spinning_writers = 0; - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->read_locks, 0); - eb->write_locks = 0; -#endif + ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } @@ -5105,7 +5182,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, rcu_read_lock(); eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> PAGE_SHIFT); + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); /* @@ -5157,7 +5234,7 @@ again: } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5178,7 +5255,7 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; int num_pages; @@ -5196,6 +5273,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } + if (fs_info->sectorsize < PAGE_SIZE && + offset_in_page(start) + len > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %lu", + start, len); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -5203,6 +5288,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); + btrfs_set_buffer_lockdep_class(owner_root, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { @@ -5231,13 +5317,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } exists = NULL; - /* - * Do this so attach doesn't complain and we need to - * drop the ref the old guy had. - */ - ClearPagePrivate(p); WARN_ON(PageDirty(p)); - put_page(p); + detach_page_private(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); @@ -5265,7 +5346,7 @@ again: spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5321,7 +5402,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_lock(&fs_info->buffer_lock); radix_tree_delete(&fs_info->buffer_radix, - eb->start >> PAGE_SHIFT); + eb->start >> fs_info->sectorsize_bits); spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); @@ -5622,12 +5703,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5652,13 +5733,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5687,13 +5768,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5719,7 +5800,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); } @@ -5729,7 +5810,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); } @@ -5742,12 +5823,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5771,12 +5852,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5800,10 +5881,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); + if (dst->fs_info->sectorsize == PAGE_SIZE) { + num_pages = num_extent_pages(dst); + for (i = 0; i < num_pages; i++) + copy_page(page_address(dst->pages[i]), + page_address(src->pages[i])); + } else { + size_t src_offset = get_eb_offset_in_page(src, 0); + size_t dst_offset = get_eb_offset_in_page(dst, 0); + + ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + memcpy(page_address(dst->pages[0]) + dst_offset, + page_address(src->pages[0]) + src_offset, + src->len); + } } void copy_extent_buffer(const struct extent_buffer *dst, @@ -5816,7 +5907,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, size_t offset; struct page *page; char *kaddr; - unsigned long i = dst_offset >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -5824,7 +5915,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = offset_in_page(dst_offset); + offset = get_eb_offset_in_page(dst, dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5868,7 +5959,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + byte_offset; + offset = start + offset_in_page(eb->start) + byte_offset; *page_index = offset >> PAGE_SHIFT; *page_offset = offset_in_page(offset); @@ -6022,11 +6113,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, return; while (len > 0) { - dst_off_in_page = offset_in_page(dst_offset); - src_off_in_page = offset_in_page(src_offset); + dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); + src_off_in_page = get_eb_offset_in_page(dst, src_offset); - dst_i = dst_offset >> PAGE_SHIFT; - src_i = src_offset >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_offset); + src_i = get_eb_page_index(src_offset); cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); @@ -6062,11 +6153,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } while (len > 0) { - dst_i = dst_end >> PAGE_SHIFT; - src_i = src_end >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_end); + src_i = get_eb_page_index(src_end); - dst_off_in_page = offset_in_page(dst_end); - src_off_in_page = offset_in_page(src_end); + dst_off_in_page = get_eb_offset_in_page(dst, dst_end); + src_off_in_page = get_eb_offset_in_page(dst, src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); @@ -6121,3 +6212,54 @@ int try_release_extent_buffer(struct page *page) return release_extent_buffer(eb); } + +/* + * btrfs_readahead_tree_block - attempt to readahead a child block + * @fs_info: the fs_info + * @bytenr: bytenr to read + * @owner_root: objectid of the root that owns this eb + * @gen: generation for the uptodate check, can be 0 + * @level: level for the eb + * + * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a + * normal uptodate check of the eb, without checking the generation. If we have + * to read the block we will not block on anything. + */ +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, u64 gen, int level) +{ + struct extent_buffer *eb; + int ret; + + eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + if (IS_ERR(eb)) + return; + + if (btrfs_buffer_uptodate(eb, gen, 1)) { + free_extent_buffer(eb); + return; + } + + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(eb); + else + free_extent_buffer(eb); +} + +/* + * btrfs_readahead_node_child - readahead a node's child block + * @node: parent node we're reading from + * @slot: slot in the parent node for the child we want to read + * + * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at + * the slot in the node provided. + */ +void btrfs_readahead_node_child(struct extent_buffer *node, int slot) +{ + btrfs_readahead_tree_block(node->fs_info, + btrfs_node_blockptr(node, slot), + btrfs_header_owner(node), + btrfs_node_ptr_generation(node, slot), + btrfs_header_level(node) - 1); +} |