aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/scrub.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--fs/btrfs/scrub.c893
1 files changed, 831 insertions, 62 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..9e1569ffbf6e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+struct scrub_recover {
+ atomic_t refs;
+ struct btrfs_bio *bbio;
+ u64 *raid_map;
+ u64 map_length;
+};
+
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
+ struct list_head list;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
unsigned int io_error:1;
};
u8 csum[BTRFS_CSUM_SIZE];
+
+ struct scrub_recover *recover;
};
struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
atomic_t outstanding_pages;
atomic_t ref_count; /* free mem on transition to zero */
struct scrub_ctx *sctx;
+ struct scrub_parity *sparity;
struct {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
unsigned int generation_error:1; /* also sets header_error */
+
+ /* The following is for the data used to check parity */
+ /* It is for the data with checksum */
+ unsigned int data_corrected:1;
};
};
+/* Used for the chunks with parity stripe such RAID5/6 */
+struct scrub_parity {
+ struct scrub_ctx *sctx;
+
+ struct btrfs_device *scrub_dev;
+
+ u64 logic_start;
+
+ u64 logic_end;
+
+ int nsectors;
+
+ int stripe_len;
+
+ atomic_t ref_count;
+
+ struct list_head spages;
+
+ /* Work of parity check and repair */
+ struct btrfs_work work;
+
+ /* Mark the parity blocks which have data */
+ unsigned long *dbitmap;
+
+ /*
+ * Mark the parity blocks which have data, but errors happen when
+ * read data or check data
+ */
+ unsigned long *ebitmap;
+
+ unsigned long bitmap[0];
+};
+
struct scrub_wr_ctx {
struct scrub_bio *wr_curr_bio;
struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
- u16 csum_size);
+ u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+ atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+ if (atomic_dec_and_test(&recover->refs)) {
+ kfree(recover->bbio);
+ kfree(recover->raid_map);
+ kfree(recover);
+ }
+}
+
/*
* scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size);
+ csum, generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
spin_lock(&sctx->stat_lock);
sctx->stat.unverified_errors++;
+ sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, is_metadata,
have_csum, csum, generation,
- sctx->csum_size);
+ sctx->csum_size, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
*/
scrub_recheck_block(fs_info, sblock_bad,
is_metadata, have_csum, csum,
- generation, sctx->csum_size);
+ generation, sctx->csum_size, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
corrected_error:
spin_lock(&sctx->stat_lock);
sctx->stat.corrected_errors++;
+ sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
+ struct scrub_recover *recover;
int page_index;
for (page_index = 0; page_index < sblock->page_count;
page_index++) {
sblock->pagev[page_index]->sblock = NULL;
+ recover = sblock->pagev[page_index]->recover;
+ if (recover) {
+ scrub_put_recover(recover);
+ sblock->pagev[page_index]->recover =
+ NULL;
+ }
scrub_page_put(sblock->pagev[page_index]);
}
}
@@ -1215,14 +1288,63 @@ out:
return 0;
}
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+ if (raid_map) {
+ if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+ return 3;
+ else
+ return 2;
+ } else {
+ return (int)bbio->num_stripes;
+ }
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+ u64 mapped_length,
+ int nstripes, int mirror,
+ int *stripe_index,
+ u64 *stripe_offset)
+{
+ int i;
+
+ if (raid_map) {
+ /* RAID5/6 */
+ for (i = 0; i < nstripes; i++) {
+ if (raid_map[i] == RAID6_Q_STRIPE ||
+ raid_map[i] == RAID5_P_STRIPE)
+ continue;
+
+ if (logical >= raid_map[i] &&
+ logical < raid_map[i] + mapped_length)
+ break;
+ }
+
+ *stripe_index = i;
+ *stripe_offset = logical - raid_map[i];
+ } else {
+ /* The other RAID type */
+ *stripe_index = mirror;
+ *stripe_offset = 0;
+ }
+}
+
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info,
struct scrub_block *original_sblock,
u64 length, u64 logical,
struct scrub_block *sblocks_for_recheck)
{
+ struct scrub_recover *recover;
+ struct btrfs_bio *bbio;
+ u64 *raid_map;
+ u64 sublen;
+ u64 mapped_length;
+ u64 stripe_offset;
+ int stripe_index;
int page_index;
int mirror_index;
+ int nmirrors;
int ret;
/*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
page_index = 0;
while (length > 0) {
- u64 sublen = min_t(u64, length, PAGE_SIZE);
- u64 mapped_length = sublen;
- struct btrfs_bio *bbio = NULL;
+ sublen = min_t(u64, length, PAGE_SIZE);
+ mapped_length = sublen;
+ bbio = NULL;
+ raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
- ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0);
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio, 0, &raid_map);
if (ret || !bbio || mapped_length < sublen) {
kfree(bbio);
+ kfree(raid_map);
return -EIO;
}
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ if (!recover) {
+ kfree(bbio);
+ kfree(raid_map);
+ return -ENOMEM;
+ }
+
+ atomic_set(&recover->refs, 1);
+ recover->bbio = bbio;
+ recover->raid_map = raid_map;
+ recover->map_length = mapped_length;
+
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
- for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+ nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+ for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- kfree(bbio);
+ scrub_put_recover(recover);
return -ENOMEM;
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
page->logical = logical;
- page->physical = bbio->stripes[mirror_index].physical;
+
+ scrub_stripe_index_and_offset(logical, raid_map,
+ mapped_length,
+ bbio->num_stripes,
+ mirror_index,
+ &stripe_index,
+ &stripe_offset);
+ page->physical = bbio->stripes[stripe_index].physical +
+ stripe_offset;
+ page->dev = bbio->stripes[stripe_index].dev;
+
BUG_ON(page_index >= original_sblock->page_count);
page->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
- page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
sblock->page_count++;
page->page = alloc_page(GFP_NOFS);
if (!page->page)
goto leave_nomem;
+
+ scrub_get_recover(recover);
+ page->recover = recover;
}
- kfree(bbio);
+ scrub_put_recover(recover);
length -= sublen;
logical += sublen;
page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
return 0;
}
+struct scrub_bio_ret {
+ struct completion event;
+ int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+ struct scrub_bio_ret *ret = bio->bi_private;
+
+ ret->error = error;
+ complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+ return page->recover && page->recover->raid_map;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+ struct bio *bio,
+ struct scrub_page *page)
+{
+ struct scrub_bio_ret done;
+ int ret;
+
+ init_completion(&done.event);
+ done.error = 0;
+ bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_private = &done;
+ bio->bi_end_io = scrub_bio_wait_endio;
+
+ ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+ page->recover->raid_map,
+ page->recover->map_length,
+ page->mirror_num, 0);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.event);
+ if (done.error)
+ return -EIO;
+
+ return 0;
+}
+
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
- u16 csum_size)
+ u16 csum_size, int retry_failed_mirror)
{
int page_num;
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue;
}
bio->bi_bdev = page->dev->bdev;
- bio->bi_iter.bi_sector = page->physical >> 9;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
- if (btrfsic_submit_bio_wait(READ, bio))
- sblock->no_io_error_seen = 0;
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ sblock->no_io_error_seen = 0;
+ } else {
+ bio->bi_iter.bi_sector = page->physical >> 9;
+
+ if (btrfsic_submit_bio_wait(READ, bio))
+ sblock->no_io_error_seen = 0;
+ }
bio_put(bio);
}
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
int page_num;
+ /*
+ * This block is used for the check of the parity on the source device,
+ * so the data needn't be written into the destination device.
+ */
+ if (sblock->sparity)
+ return;
+
for (page_num = 0; page_num < sblock->page_count; page_num++) {
int ret;
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
if (atomic_dec_and_test(&sblock->ref_count)) {
int i;
+ if (sblock->sparity)
+ scrub_parity_put(sblock->sparity);
+
for (i = 0; i < sblock->page_count; i++)
scrub_page_put(sblock->pagev[i]);
kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
scrub_pending_bio_dec(sctx);
}
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+ unsigned long *bitmap,
+ u64 start, u64 len)
+{
+ int offset;
+ int nsectors;
+ int sectorsize = sparity->sctx->dev_root->sectorsize;
+
+ if (len >= sparity->stripe_len) {
+ bitmap_set(bitmap, 0, sparity->nsectors);
+ return;
+ }
+
+ start -= sparity->logic_start;
+ offset = (int)do_div(start, sparity->stripe_len);
+ offset /= sectorsize;
+ nsectors = (int)len / sectorsize;
+
+ if (offset + nsectors <= sparity->nsectors) {
+ bitmap_set(bitmap, offset, nsectors);
+ return;
+ }
+
+ bitmap_set(bitmap, offset, sparity->nsectors - offset);
+ bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+}
+
static void scrub_block_complete(struct scrub_block *sblock)
{
+ int corrupted = 0;
+
if (!sblock->no_io_error_seen) {
+ corrupted = 1;
scrub_handle_errored_block(sblock);
} else {
/*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
* dev replace case, otherwise write here in dev replace
* case.
*/
- if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+ corrupted = scrub_checksum(sblock);
+ if (!corrupted && sblock->sctx->is_dev_replace)
scrub_write_block_to_dev_replace(sblock);
}
+
+ if (sblock->sparity && corrupted && !sblock->data_corrected) {
+ u64 start = sblock->pagev[0]->logical;
+ u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ PAGE_SIZE;
+
+ scrub_parity_mark_sectors_error(sblock->sparity,
+ start, end - start);
+ }
}
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
return 0;
}
+static int scrub_pages_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
+ atomic_set(&sblock->ref_count, 1);
+ sblock->sctx = sctx;
+ sblock->no_io_error_seen = 1;
+ sblock->sparity = sparity;
+ scrub_parity_get(sparity);
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ /* For scrub block */
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
+ /* For scrub parity */
+ scrub_page_get(spage);
+ list_add_tail(&spage->list, &sparity->spages);
+ spage->sblock = sblock;
+ spage->dev = dev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sctx->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+
+ WARN_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
+
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sctx->sectorsize;
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ blocksize = sctx->nodesize;
+ } else {
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u64 l = min_t(u64, len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
+ if (have_csum == 0)
+ goto skip;
+ }
+ ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ flags, gen, mirror_num,
+ have_csum ? csum : NULL);
+ if (ret)
+ return ret;
+skip:
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+ return 0;
+}
+
/*
* Given a physical address, this will calculate it's
* logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset)
+ struct map_lookup *map, u64 *offset,
+ u64 *stripe_start)
{
int i;
int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
last_offset = (physical - map->stripes[num].physical) *
nr_data_stripes(map);
+ if (stripe_start)
+ *stripe_start = last_offset;
+
*offset = last_offset;
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
return 1;
}
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_page *curr, *next;
+ int nbits;
+
+ nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ if (nbits) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors += nbits;
+ sctx->stat.uncorrectable_errors += nbits;
+ spin_unlock(&sctx->stat_lock);
+ }
+
+ list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_del_init(&curr->list);
+ scrub_page_put(curr);
+ }
+
+ kfree(sparity);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio, int error)
+{
+ struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ if (error)
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+ bio_put(bio);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ struct scrub_page *spage;
+ struct btrfs_bio *bbio = NULL;
+ u64 *raid_map = NULL;
+ u64 length;
+ int ret;
+
+ if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
+ sparity->nsectors))
+ goto out;
+
+ length = sparity->logic_end - sparity->logic_start + 1;
+ ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
+ sparity->logic_start,
+ &length, &bbio, 0, &raid_map);
+ if (ret || !bbio || !raid_map)
+ goto bbio_out;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+ bio->bi_private = sparity;
+ bio->bi_end_io = scrub_parity_bio_endio;
+
+ rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+ raid_map, length,
+ sparity->scrub_dev,
+ sparity->dbitmap,
+ sparity->nsectors);
+ if (!rbio)
+ goto rbio_out;
+
+ list_for_each_entry(spage, &sparity->spages, list)
+ raid56_parity_add_scrub_pages(rbio, spage->page,
+ spage->logical);
+
+ scrub_pending_bio_inc(sctx);
+ raid56_parity_submit_scrub_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ kfree(bbio);
+ kfree(raid_map);
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+out:
+ scrub_free_parity(sparity);
+}
+
+static inline int scrub_calc_parity_bitmap_len(int nsectors)
+{
+ return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+}
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+ atomic_inc(&sparity->ref_count);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+ if (!atomic_dec_and_test(&sparity->ref_count))
+ return;
+
+ scrub_parity_check_and_repair(sparity);
+}
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logic_start,
+ u64 logic_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ u64 flags;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 generation;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ struct btrfs_device *extent_dev;
+ struct scrub_parity *sparity;
+ int nsectors;
+ int bitmap_len;
+ int extent_mirror_num;
+ int stop_loop = 0;
+
+ nsectors = map->stripe_len / root->sectorsize;
+ bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
+ sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
+ GFP_NOFS);
+ if (!sparity) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ sparity->stripe_len = map->stripe_len;
+ sparity->nsectors = nsectors;
+ sparity->sctx = sctx;
+ sparity->scrub_dev = sdev;
+ sparity->logic_start = logic_start;
+ sparity->logic_end = logic_end;
+ atomic_set(&sparity->ref_count, 1);
+ INIT_LIST_HEAD(&sparity->spages);
+ sparity->dbitmap = sparity->bitmap;
+ sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+
+ ret = 0;
+ while (logic_start < logic_end) {
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logic_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = btrfs_previous_extent_item(root, path, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key,
+ path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ stop_loop = 0;
+ while (1) {
+ u64 bytes;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ stop_loop = 1;
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->nodesize;
+ else
+ bytes = key.offset;
+
+ if (key.objectid + bytes <= logic_start)
+ goto next;
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
+ if (key.objectid > logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ while (key.objectid >= logic_start + map->stripe_len)
+ logic_start += map->stripe_len;
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logic_start &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
+ goto next;
+ }
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
+ if (extent_logical < logic_start) {
+ extent_len -= logic_start - extent_logical;
+ extent_logical = logic_start;
+ }
+
+ if (extent_logical + extent_len >
+ logic_start + map->stripe_len)
+ extent_len = logic_start + map->stripe_len -
+ extent_logical;
+
+ scrub_parity_mark_sectors_data(sparity, extent_logical,
+ extent_len);
+
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical + extent_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
+ ret = scrub_extent_for_parity(sparity, extent_logical,
+ extent_len,
+ extent_physical,
+ extent_dev, flags,
+ generation,
+ extent_mirror_num);
+ if (ret)
+ goto out;
+
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ logic_start += map->stripe_len;
+
+ if (logic_start >= logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ if (logic_start < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+ }
+next:
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ if (stop_loop)
+ break;
+
+ logic_start += map->stripe_len;
+ }
+out:
+ if (ret < 0)
+ scrub_parity_mark_sectors_error(sparity, logic_start,
+ logic_end - logic_start + 1);
+ scrub_parity_put(sparity);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
int num, u64 base, u64 length,
int is_dev_replace)
{
- struct btrfs_path *path;
+ struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_logical;
u64 extent_physical;
u64 extent_len;
+ u64 stripe_logical;
+ u64 stripe_end;
struct btrfs_device *extent_dev;
int extent_mirror_num;
int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
mirror_num = num % map->num_stripes + 1;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
- get_raid56_logic_offset(physical, num, map, &offset);
+ get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
mirror_num = 1;
} else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
+ ppath = btrfs_alloc_path();
+ if (!ppath) {
+ btrfs_free_path(ppath);
+ return -ENOMEM;
+ }
+
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
get_raid56_logic_offset(physical_end, num,
- map, &logic_end);
+ map, &logic_end, NULL);
logic_end += base;
} else {
logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
ret = get_raid56_logic_offset(physical, num,
- map, &logical);
+ map, &logical, &stripe_logical);
logical += base;
- if (ret)
+ if (ret) {
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment - 1;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
goto skip;
+ }
}
/*
* canceled?
@@ -2558,13 +3284,25 @@ again:
* loop until we find next data stripe
* or we have finished all stripes.
*/
- do {
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(
- physical, num,
- map, &logical);
- logical += base;
- } while (physical < physical_end && ret);
+loop:
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(physical,
+ num, map, &logical,
+ &stripe_logical);
+ logical += base;
+
+ if (ret && physical < physical_end) {
+ stripe_logical += base;
+ stripe_end = stripe_logical +
+ increment - 1;
+ ret = scrub_raid56_parity(sctx,
+ map, scrub_dev, ppath,
+ stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto loop;
+ }
} else {
physical += map->stripe_len;
logical += increment;
@@ -2605,6 +3343,7 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
+ btrfs_free_path(ppath);
return ret < 0 ? ret : 0;
}
@@ -3310,6 +4049,50 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0;
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
- struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
- u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock;
- }
-
- /*
- * This extent does not actually cover the logical extent anymore,
- * move on to the next inode.
- */
- if (em->block_start > nocow_ctx->logical ||
- em->block_start + em->block_len < nocow_ctx->logical + len) {
- free_extent_map(em);
- goto out_unlock;
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
}
- free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
goto next_page;
} else {
ClearPageError(page);
- err = extent_read_full_page_nolock(io_tree, page,
+ err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@@ -3421,6 +4184,14 @@ again:
goto next_page;
}
}
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@@ -3434,12 +4205,10 @@ next_page:
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
-out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);