-rw-r--r--  drivers/md/md.h        11
-rw-r--r--  drivers/md/raid1-10.c  69
-rw-r--r--  drivers/md/raid1.c    550
-rw-r--r--  drivers/md/raid1.h      1
-rw-r--r--  drivers/md/raid10.c    58
-rw-r--r--  drivers/md/raid5.c     35
6 files changed, 444 insertions, 280 deletions
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8d881cc59799..b2076a165c10 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -207,6 +207,7 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
+ Nonrot, /* non-rotational device (SSD) */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
}
return 0;
}
+
+static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
+ int sectors)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
+}
+
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
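
The new rdev_has_badblock() helper is a thin wrapper for the many call sites in the per-level code that only care whether any bad block overlaps a range and never read the first_bad/bad_sectors outputs. A minimal usage sketch (illustration only, not part of the patch; some_rdev, sect, nr and handle_bad_range() are placeholder names):

        /* before: every such caller had to declare throwaway outputs */
        sector_t first_bad;
        int bad_sectors;

        if (is_badblock(some_rdev, sect, nr, &first_bad, &bad_sectors))
                handle_bad_range();

        /* after: the wrapper hides the outputs the caller never reads */
        if (rdev_has_badblock(some_rdev, sect, nr))
                handle_bad_range();

Because the wrapper simply forwards is_badblock()'s return value, callers that distinguish the negative (unacknowledged bad blocks) case keep working; the raid10 write path below still tests rdev_has_badblock(...) < 0.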
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 512746551f36..2ea1710a3b70 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
return false;
}
+
+/**
+ * raid1_check_read_range() - check a given read range for bad blocks and
+ * return the available read length;
+ * @rdev: the rdev to read;
+ * @this_sector: read position;
+ * @len: read length;
+ *
+ * helper function for read_balance()
+ *
+ * 1) If there are no bad blocks in the range, @len is returned;
+ * 2) If the range is all bad blocks, 0 is returned;
+ * 3) If there are partial bad blocks:
+ *  - If the bad block range starts after @this_sector, the length of the
+ *  first good region is returned;
+ *  - If the bad block range starts before or at @this_sector, 0 is returned
+ *  and @len is updated to the number of sectors before the good blocks
+ *  start;
+ */
+static inline int raid1_check_read_range(struct md_rdev *rdev,
+ sector_t this_sector, int *len)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ /* no bad block overlap */
+ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
+ return *len;
+
+ /*
+ * the bad block range starts partway into our range, so we can return the
+ * number of sectors before the bad blocks start.
+ */
+ if (first_bad > this_sector)
+ return first_bad - this_sector;
+
+ /* read range is fully consumed by bad blocks. */
+ if (this_sector + *len <= first_bad + bad_sectors)
+ return 0;
+
+ /*
+ * final case, bad block range starts before or at the start of our
+ * range but does not cover our entire range so we still return 0 but
+ * update the length with the number of sectors before we get to the
+ * good ones.
+ */
+ *len = first_bad + bad_sectors - this_sector;
+ return 0;
+}
+
+/*
+ * Check if read should choose the first rdev.
+ *
+ * Balance on the whole device if no resync is going on (recovery is ok) or
+ * below the resync window. Otherwise, take the first readable disk.
+ */
+static inline bool raid1_should_read_first(struct mddev *mddev,
+ sector_t this_sector, int len)
+{
+ if ((mddev->recovery_cp < this_sector + len))
+ return true;
+
+ if (mddev_is_clustered(mddev) &&
+ md_cluster_ops->area_resyncing(mddev, READ, this_sector,
+ this_sector + len))
+ return true;
+
+ return false;
+}
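
The three cases documented for raid1_check_read_range() are easiest to see with concrete numbers. Below is a stand-alone user-space model of the same arithmetic (illustration only, not kernel code): a single hard-coded bad range [first_bad, first_bad + bad_sectors) stands in for the is_badblock() lookup, and bad_sectors == 0 represents "no overlapping bad blocks".

        #include <stdio.h>

        typedef unsigned long long sector_t;

        /*
         * Same shape as raid1_check_read_range(), with the bad range passed in
         * explicitly instead of being looked up through is_badblock().  Assumes
         * the bad range overlaps the read (as is_badblock() guarantees) whenever
         * bad_sectors is non-zero.
         */
        static int check_read_range(sector_t first_bad, int bad_sectors,
                                    sector_t this_sector, int *len)
        {
                /* case 1: no overlapping bad blocks */
                if (!bad_sectors)
                        return *len;

                /* case 3a: bad range starts after this_sector; return the good prefix */
                if (first_bad > this_sector)
                        return first_bad - this_sector;

                /* case 2: the whole read is covered by bad blocks */
                if (this_sector + *len <= first_bad + bad_sectors)
                        return 0;

                /* case 3b: bad prefix; shrink *len to the part to skip and return 0 */
                *len = first_bad + bad_sectors - this_sector;
                return 0;
        }

        int main(void)
        {
                int len, ret;

                len = 8;        /* read [96,104), bad [100,104) */
                printf("case 3a -> %d\n", check_read_range(100, 4, 96, &len));  /* 4 */

                len = 8;        /* read [96,104), bad [96,112) */
                printf("case 2  -> %d\n", check_read_range(96, 16, 96, &len));  /* 0 */

                len = 8;        /* read [96,104), bad [90,100) */
                ret = check_read_range(90, 10, 96, &len);
                printf("case 3b -> %d, len=%d\n", ret, len);                    /* 0, len=4 */
                return 0;
        }

In the raid1.c changes below, choose_first_rdev() takes the first disk with any readable prefix (read_len > 0), while choose_bb_rdev() keeps the disk with the largest returned length.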
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 286f8b16c7bd..afca975ec7f3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
- sector_t first_bad;
- int bad_sectors;
-
r1_bio->bios[mirror] = NULL;
to_put = bio;
/*
@@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
@@ -582,211 +579,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
return len;
}
-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last know head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
+static void update_read_sectors(struct r1conf *conf, int disk,
+ sector_t this_sector, int len)
{
- const sector_t this_sector = r1_bio->sector;
- int sectors;
- int best_good_sectors;
- int best_disk, best_dist_disk, best_pending_disk;
- int has_nonrot_disk;
+ struct raid1_info *info = &conf->mirrors[disk];
+
+ atomic_inc(&info->rdev->nr_pending);
+ if (info->next_seq_sect != this_sector)
+ info->seq_start = this_sector;
+ info->next_seq_sect = this_sector + len;
+}
+
+static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int len = r1_bio->sectors;
int disk;
- sector_t best_dist;
- unsigned int min_pending;
- struct md_rdev *rdev;
- int choose_first;
- int choose_next_idle;
- /*
- * Check if we can balance. We can balance on the whole
- * device if no resync is going on, or below the resync window.
- * We take the first readable disk when above the resync window.
- */
- retry:
- sectors = r1_bio->sectors;
- best_disk = -1;
- best_dist_disk = -1;
- best_dist = MaxSector;
- best_pending_disk = -1;
- min_pending = UINT_MAX;
- best_good_sectors = 0;
- has_nonrot_disk = 0;
- choose_next_idle = 0;
- clear_bit(R1BIO_FailFast, &r1_bio->state);
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ int read_len;
- if ((conf->mddev->recovery_cp < this_sector + sectors) ||
- (mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
- this_sector + sectors)))
- choose_first = 1;
- else
- choose_first = 0;
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ /* choose the first disk even if it has some bad blocks. */
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len > 0) {
+ update_read_sectors(conf, disk, this_sector, read_len);
+ *max_sectors = read_len;
+ return disk;
+ }
+ }
+
+ return -1;
+}
+
+static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int best_disk = -1;
+ int best_len = 0;
+ int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
- sector_t dist;
- sector_t first_bad;
- int bad_sectors;
- unsigned int pending;
- bool nonrot;
+ struct md_rdev *rdev;
+ int len;
+ int read_len;
+
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
rdev = conf->mirrors[disk].rdev;
- if (r1_bio->bios[disk] == IO_BLOCKED
- || rdev == NULL
- || test_bit(Faulty, &rdev->flags))
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ test_bit(WriteMostly, &rdev->flags))
continue;
- if (!test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < this_sector + sectors)
+
+ /* keep track of the disk with the most readable sectors. */
+ len = r1_bio->sectors;
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len > best_len) {
+ best_disk = disk;
+ best_len = read_len;
+ }
+ }
+
+ if (best_disk != -1) {
+ *max_sectors = best_len;
+ update_read_sectors(conf, best_disk, this_sector, best_len);
+ }
+
+ return best_disk;
+}
+
+static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int bb_disk = -1;
+ int bb_read_len = 0;
+ int disk;
+
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ int len;
+ int read_len;
+
+ if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
- if (test_bit(WriteMostly, &rdev->flags)) {
- /* Don't balance among write-mostly, just
- * use the first as a last resort */
- if (best_dist_disk < 0) {
- if (is_badblock(rdev, this_sector, sectors,
- &first_bad, &bad_sectors)) {
- if (first_bad <= this_sector)
- /* Cannot use this */
- continue;
- best_good_sectors = first_bad - this_sector;
- } else
- best_good_sectors = sectors;
- best_dist_disk = disk;
- best_pending_disk = disk;
- }
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ !test_bit(WriteMostly, &rdev->flags))
continue;
+
+ /* there are no bad blocks, we can use this disk */
+ len = r1_bio->sectors;
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len == r1_bio->sectors) {
+ update_read_sectors(conf, disk, this_sector, read_len);
+ return disk;
}
- /* This is a reasonable device to use. It might
- * even be best.
+
+ /*
+ * there are partial bad blocks, choose the rdev with the largest
+ * read length.
*/
- if (is_badblock(rdev, this_sector, sectors,
- &first_bad, &bad_sectors)) {
- if (best_dist < MaxSector)
- /* already have a better device */
- continue;
- if (first_bad <= this_sector) {
- /* cannot read here. If this is the 'primary'
- * device, then we must not read beyond
- * bad_sectors from another device..
- */
- bad_sectors -= (this_sector - first_bad);
- if (choose_first && sectors > bad_sectors)
- sectors = bad_sectors;
- if (best_good_sectors > sectors)
- best_good_sectors = sectors;
-
- } else {
- sector_t good_sectors = first_bad - this_sector;
- if (good_sectors > best_good_sectors) {
- best_good_sectors = good_sectors;
- best_disk = disk;
- }
- if (choose_first)
- break;
- }
- continue;
- } else {
- if ((sectors > best_good_sectors) && (best_disk >= 0))
- best_disk = -1;
- best_good_sectors = sectors;
+ if (read_len > bb_read_len) {
+ bb_disk = disk;
+ bb_read_len = read_len;
}
+ }
+
+ if (bb_disk != -1) {
+ *max_sectors = bb_read_len;
+ update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
+ }
+
+ return bb_disk;
+}
+
+static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
+{
+ /* TODO: address issues with this check and concurrency. */
+ return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
+ conf->mirrors[disk].head_position == r1_bio->sector;
+}
+
+/*
+ * If the buffered sequential IO size exceeds the optimal iosize, check if
+ * there is an idle disk. If yes, choose the idle disk.
+ */
+static bool should_choose_next(struct r1conf *conf, int disk)
+{
+ struct raid1_info *mirror = &conf->mirrors[disk];
+ int opt_iosize;
+
+ if (!test_bit(Nonrot, &mirror->rdev->flags))
+ return false;
+
+ opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
+ return opt_iosize > 0 && mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
+}
+
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* still in recovery */
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+ return false;
+
+ /* don't read from a slow disk unless we have to */
+ if (test_bit(WriteMostly, &rdev->flags))
+ return false;
+
+ /* don't split IO for bad blocks unless we have to */
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+ return false;
+
+ return true;
+}
+
+struct read_balance_ctl {
+ sector_t closest_dist;
+ int closest_dist_disk;
+ int min_pending;
+ int min_pending_disk;
+ int sequential_disk;
+ int readable_disks;
+};
+
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int disk;
+ struct read_balance_ctl ctl = {
+ .closest_dist_disk = -1,
+ .closest_dist = MaxSector,
+ .min_pending_disk = -1,
+ .min_pending = UINT_MAX,
+ .sequential_disk = -1,
+ };
+
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ sector_t dist;
+ unsigned int pending;
+
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev_readable(rdev, r1_bio))
+ continue;
- if (best_disk >= 0)
- /* At least two disks to choose from so failfast is OK */
+ /* At least two disks to choose from so failfast is OK */
+ if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state);
- nonrot = bdev_nonrot(rdev->bdev);
- has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
- dist = abs(this_sector - conf->mirrors[disk].head_position);
- if (choose_first) {
- best_disk = disk;
- break;
- }
+ dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
/* Don't change to another disk for sequential reads */
- if (conf->mirrors[disk].next_seq_sect == this_sector
- || dist == 0) {
- int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
- struct raid1_info *mirror = &conf->mirrors[disk];
+ if (is_sequential(conf, disk, r1_bio)) {
+ if (!should_choose_next(conf, disk))
+ return disk;
- best_disk = disk;
/*
- * If buffered sequential IO size exceeds optimal
- * iosize, check if there is idle disk. If yes, choose
- * the idle disk. read_balance could already choose an
- * idle disk before noticing it's a sequential IO in
- * this disk. This doesn't matter because this disk
- * will idle, next time it will be utilized after the
- * first disk has IO size exceeds optimal iosize. In
- * this way, iosize of the first disk will be optimal
- * iosize at least. iosize of the second disk might be
- * small, but not a big deal since when the second disk
- * starts IO, the first disk is likely still busy.
+ * Add 'pending' to avoid choosing this disk if
+ * there is another idle disk.
*/
- if (nonrot && opt_iosize > 0 &&
- mirror->seq_start != MaxSector &&
- mirror->next_seq_sect > opt_iosize &&
- mirror->next_seq_sect - opt_iosize >=
- mirror->seq_start) {
- choose_next_idle = 1;
- continue;
- }
- break;
+ pending++;
+ /*
+ * If there is no other idle disk, this disk
+ * will be chosen.
+ */
+ ctl.sequential_disk = disk;
}
- if (choose_next_idle)
- continue;
-
- if (min_pending > pending) {
- min_pending = pending;
- best_pending_disk = disk;
+ if (ctl.min_pending > pending) {
+ ctl.min_pending = pending;
+ ctl.min_pending_disk = disk;
}
- if (dist < best_dist) {
- best_dist = dist;
- best_dist_disk = disk;
+ if (ctl.closest_dist > dist) {
+ ctl.closest_dist = dist;
+ ctl.closest_dist_disk = disk;
}
}
/*
+ * The sequential IO size exceeds the optimal iosize, but there is no
+ * other idle disk, so choose the sequential disk.
+ */
+ if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+ return ctl.sequential_disk;
+
+ /*
* If all disks are rotational, choose the closest disk. If any disk is
 * non-rotational, choose the disk with fewer pending requests even if that
 * disk is rotational, which might or might not be optimal for raids with
 * mixed rotational/non-rotational disks depending on workload.
*/
- if (best_disk == -1) {
- if (has_nonrot_disk || min_pending == 0)
- best_disk = best_pending_disk;
- else
- best_disk = best_dist_disk;
- }
+ if (ctl.min_pending_disk != -1 &&
+ (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+ return ctl.min_pending_disk;
+ else
+ return ctl.closest_dist_disk;
+}
- if (best_disk >= 0) {
- rdev = conf->mirrors[best_disk].rdev;
- if (!rdev)
- goto retry;
- atomic_inc(&rdev->nr_pending);
- sectors = best_good_sectors;
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ * has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop through all disks, skipping slow
+ *    disks and disks with bad blocks for now. Only pay attention to the
+ *    key disk choice.
+ *
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
+ *    the one with the most readable sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
+ * if it is write mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ int disk;
+
+ clear_bit(R1BIO_FailFast, &r1_bio->state);
- if (conf->mirrors[best_disk].next_seq_sect != this_sector)
- conf->mirrors[best_disk].seq_start = this_sector;
+ if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+ r1_bio->sectors))
+ return choose_first_rdev(conf, r1_bio, max_sectors);
- conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
+ disk = choose_best_rdev(conf, r1_bio);
+ if (disk >= 0) {
+ *max_sectors = r1_bio->sectors;
+ update_read_sectors(conf, disk, r1_bio->sector,
+ r1_bio->sectors);
+ return disk;
}
- *max_sectors = sectors;
- return best_disk;
+ /*
+ * If we get here, it means we didn't find a perfectly good disk, so
+ * now spend a bit more time trying to find one with the most good
+ * sectors.
+ */
+ disk = choose_bb_rdev(conf, r1_bio, max_sectors);
+ if (disk >= 0)
+ return disk;
+
+ return choose_slow_rdev(conf, r1_bio, max_sectors);
}
static void wake_up_barrier(struct r1conf *conf)
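
One part of the rework above that is easy to misread is should_choose_next(): for a non-rotational disk it only suggests breaking a sequential streak once the streak is already longer than the device's optimal IO size. A stand-alone sketch with made-up numbers (illustration only, not kernel code; the seq_start != MaxSector and Nonrot checks are omitted here):

        #include <stdbool.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned long long seq_start = 1000;     /* start of the current sequential run */
                unsigned long long next_seq_sect = 1520; /* sector the next sequential read would hit */
                unsigned long long opt_iosize = 256;     /* bdev_io_opt() >> 9, e.g. 128 KiB */

                /*
                 * The run so far is next_seq_sect - seq_start = 520 sectors,
                 * which already exceeds opt_iosize, so the next read may be
                 * handed to an idle disk instead of staying on this one.
                 */
                bool prefer_idle = opt_iosize > 0 &&
                                   next_seq_sect > opt_iosize &&
                                   next_seq_sect - opt_iosize >= seq_start;

                printf("prefer an idle disk: %s\n", prefer_idle ? "yes" : "no");
                return 0;
        }

Note that in choose_best_rdev() this does not switch disks immediately: it only bumps 'pending' for the sequential disk and remembers it in ctl.sequential_disk, so the streak continues anyway if no other disk turns out to be idle.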
@@ -1760,6 +1858,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
+ bool replacement)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+
+ if (replacement)
+ info += conf->raid_disks;
+
+ if (info->rdev)
+ return false;
+
+ if (bdev_nonrot(rdev->bdev)) {
+ set_bit(Nonrot, &rdev->flags);
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+ }
+
+ rdev->raid_disk = disk;
+ info->head_position = 0;
+ info->seq_start = MaxSector;
+ WRITE_ONCE(info->rdev, rdev);
+
+ return true;
+}
+
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+ struct md_rdev *rdev = info->rdev;
+
+ if (!rdev || test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending))
+ return false;
+
+ /* Only remove non-faulty devices if recovery is not possible. */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+ rdev->mddev->degraded < conf->raid_disks)
+ return false;
+
+ if (test_and_clear_bit(Nonrot, &rdev->flags))
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
+ WRITE_ONCE(info->rdev, NULL);
+ return true;
+}
+
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
@@ -1795,15 +1939,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- p->head_position = 0;
- rdev->raid_disk = mirror;
+ raid1_add_conf(conf, rdev, mirror, false);
err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1813,13 +1955,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
- p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = repl_slot;
+ raid1_add_conf(conf, rdev, repl_slot, true);
err = 0;
conf->fullsync = 1;
- WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
@@ -1836,27 +1976,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks))
goto abort;
- if (rdev != p->rdev)
- p = conf->mirrors + conf->raid_disks + number;
+ if (rdev != p->rdev) {
+ number += conf->raid_disks;
+ p = conf->mirrors + number;
+ }
print_conf(conf);
if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
+ if (!raid1_remove_conf(conf, number)) {
err = -EBUSY;
goto abort;
}
- /* Only remove non-faulty devices if recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
- mddev->degraded < conf->raid_disks) {
- err = -EBUSY;
- goto abort;
- }
- WRITE_ONCE(p->rdev, NULL);
- if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+ if (number < conf->raid_disks &&
+ conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
@@ -1944,8 +2077,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
- sector_t first_bad;
- int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
@@ -1955,14 +2086,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
- } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) &&
- !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
- r1_bio->sector,
- r1_bio->sectors,
- &first_bad, &bad_sectors)
- )
+ } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector, r1_bio->sectors)) {
set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
put_sync_write_buf(r1_bio, uptodate);
}
@@ -2279,16 +2407,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) &&
- is_badblock(rdev, sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev, sect, s) == 0) {
atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
@@ -3006,23 +3130,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
+ conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
- if (disk_idx >= mddev->raid_disks
- || disk_idx < 0)
+
+ if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue;
- if (test_bit(Replacement, &rdev->flags))
- disk = conf->mirrors + mddev->raid_disks + disk_idx;
- else
- disk = conf->mirrors + disk_idx;
- if (disk->rdev)
+ if (!raid1_add_conf(conf, rdev, disk_idx,
+ test_bit(Replacement, &rdev->flags)))
goto abort;
- disk->rdev = rdev;
- disk->head_position = 0;
- disk->seq_start = MaxSector;
}
- conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14d4211a123a..5300cbaa58a4 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements.
*/
int raid_disks;
+ int nonrot_disks;
spinlock_t device_lock;
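
The nonrot_disks counter added here is maintained by raid1_add_conf()/raid1_remove_conf() in the raid1.c hunks above (WRITE_ONCE() on add/remove, paired through the new Nonrot rdev flag), so the read path no longer queries the block layer for every disk on every read. Condensed from the read_balance() changes above, shown side by side for illustration only:

        /* old read_balance() inner loop: per-disk, per-read query */
        nonrot = bdev_nonrot(rdev->bdev);
        has_nonrot_disk |= nonrot;
        ...
        if (has_nonrot_disk || min_pending == 0)
                best_disk = best_pending_disk;

        /* new choose_best_rdev(): one lockless read of the cached count */
        if (ctl.min_pending_disk != -1 &&
            (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
                return ctl.min_pending_disk;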
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7412066ea22c..8aecdb1ccc16 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will
* wait for the 'master' bio.
- */
- sector_t first_bad;
- int bad_sectors;
-
- /*
+ *
* Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could
@@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors) &&
+ !discard_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@@ -753,17 +748,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
best_good_sectors = 0;
do_balance = 1;
clear_bit(R10BIO_FailFast, &r10_bio->state);
- /*
- * Check if we can balance. We can balance on the whole
- * device if no resync is going on (recovery is ok), or below
- * the resync window. We take the first readable disk when
- * above the resync window.
- */
- if ((conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync)) ||
- (mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
- this_sector + sectors)))
+
+ if (raid1_should_read_first(conf->mddev, this_sector, sectors))
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
@@ -1330,10 +1316,7 @@ retry_wait:
}
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
- int is_bad;
/*
* Discard request doesn't care the write result
@@ -1342,9 +1325,8 @@ retry_wait:
if (!r10_bio->sectors)
continue;
- is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
- &first_bad, &bad_sectors);
- if (is_bad < 0) {
+ if (rdev_has_badblock(rdev, dev_sector,
+ r10_bio->sectors) < 0) {
/*
* Mustn't write here until the bad block
* is acknowledged
@@ -2290,8 +2272,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
- sector_t first_bad;
- int bad_sectors;
int slot;
int repl;
struct md_rdev *rdev = NULL;
@@ -2312,11 +2292,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
- } else if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors))
+ } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
+ }
rdev_dec_pending(rdev, mddev);
@@ -2597,11 +2576,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
- sector_t first_bad;
- int bad_sectors;
-
- if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
- && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
+ if (rdev_has_badblock(rdev, sector, sectors) &&
+ (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */
@@ -2658,16 +2634,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
- is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev,
+ r10_bio->devs[sl].addr + sect,
+ s) == 0) {
atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7ec445f49f1c..48129de21aec 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1210,10 +1210,8 @@ again:
*/
while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
- int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ int bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (!bad)
break;
@@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int replacement = 0;
for (i = 0 ; i < disks; i++) {
@@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) {
if (bi->bi_status)
md_error(conf->mddev, rdev);
- else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
@@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
- } else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors)) {
+ } else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf))) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make
@@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */
for (i=disks; i--; ) {
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int is_bad = 0;
dev = &sh->dev[i];
@@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
- !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ !rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev && !test_bit(Faulty, &rdev->flags))
@@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ is_bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) {
@@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private;
struct bio *align_bio;
struct md_rdev *rdev;
- sector_t sector, end_sector, first_bad;
- int bad_sectors, dd_idx;
+ sector_t sector, end_sector;
+ int dd_idx;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending);
- if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
- &bad_sectors)) {
+ if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
rdev_dec_pending(rdev, mddev);
return 0;
}