aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-crypt.c8
-rw-r--r--drivers/md/dm-mpath.c4
-rw-r--r--drivers/md/dm-rq.c4
-rw-r--r--drivers/md/md.c5
-rw-r--r--drivers/md/md.h8
-rw-r--r--drivers/md/raid0.c12
-rw-r--r--drivers/md/raid1.c275
-rw-r--r--drivers/md/raid10.c245
-rw-r--r--drivers/md/raid5-cache.c140
-rw-r--r--drivers/md/raid5.c128
-rw-r--r--drivers/md/raid5.h7
11 files changed, 525 insertions, 311 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7c6c57216bf2..8a9f742d8ed7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1534,18 +1534,18 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
return PTR_ERR(key);
}
- rcu_read_lock();
+ down_read(&key->sem);
ukp = user_key_payload(key);
if (!ukp) {
- rcu_read_unlock();
+ up_read(&key->sem);
key_put(key);
kzfree(new_key_string);
return -EKEYREVOKED;
}
if (cc->key_size != ukp->datalen) {
- rcu_read_unlock();
+ up_read(&key->sem);
key_put(key);
kzfree(new_key_string);
return -EINVAL;
@@ -1553,7 +1553,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
memcpy(cc->key, ukp->data, cc->key_size);
- rcu_read_unlock();
+ up_read(&key->sem);
key_put(key);
/* clear the flag since following operations may invalidate previously valid key */
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6400cffb986d..3570bcb7a4a4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -427,7 +427,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
unsigned long flags;
struct priority_group *pg;
struct pgpath *pgpath;
- bool bypassed = true;
+ unsigned bypassed = 1;
if (!atomic_read(&m->nr_valid_paths)) {
clear_bit(MPATHF_QUEUE_IO, &m->flags);
@@ -466,7 +466,7 @@ check_current_pg:
*/
do {
list_for_each_entry(pg, &m->priority_groups, list) {
- if (pg->bypassed == bypassed)
+ if (pg->bypassed == !!bypassed)
continue;
pgpath = choose_path_in_pg(m, pg, nr_bytes);
if (!IS_ERR_OR_NULL(pgpath)) {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d7275fb541a..6e702fc69a83 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -779,6 +779,10 @@ static void dm_old_request_fn(struct request_queue *q)
int srcu_idx;
struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+ if (unlikely(!map)) {
+ dm_put_live_table(md, srcu_idx);
+ return;
+ }
ti = dm_table_find_target(map, pos);
dm_put_live_table(md, srcu_idx);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82821ee0d57f..01175dac0db6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev)
if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
+ /*
+ * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
+ * up mddev->thread. It is important to initialize critical
+ * resources for mddev->thread BEFORE calling pers->run().
+ */
err = pers->run(mddev);
if (err)
pr_warn("md: pers->run() failed ...\n");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e38936d05df1..2a514036a83d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -212,6 +212,7 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
struct md_cluster_info;
+/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */
enum mddev_flags {
MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
MD_CLOSING, /* If set, we are closing the array, do not open
@@ -702,4 +703,11 @@ static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
}
+
+/* clear unsupported mddev_flags */
+static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
+ unsigned long unsupported_flags)
+{
+ mddev->flags &= ~unsupported_flags;
+}
#endif /* _MD_MD_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a162fedeb51a..848365d474f3 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,6 +26,11 @@
#include "raid0.h"
#include "raid5.h"
+#define UNSUPPORTED_MDDEV_FLAGS \
+ ((1L << MD_HAS_JOURNAL) | \
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_FAILFAST_SUPPORTED))
+
static int raid0_congested(struct mddev *mddev, int bits)
{
struct r0conf *conf = mddev->private;
@@ -539,8 +544,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mddev->delta_disks = -1;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
- clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
@@ -583,7 +587,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
mddev->degraded = 0;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
- clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
return priv_conf;
@@ -626,7 +630,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
- clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf);
return priv_conf;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a1f3fbed9100..7b0f647bcccb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -42,6 +42,10 @@
#include "raid1.h"
#include "bitmap.h"
+#define UNSUPPORTED_MDDEV_FLAGS \
+ ((1L << MD_HAS_JOURNAL) | \
+ (1L << MD_JOURNAL_CLEAN))
+
/*
* Number of guaranteed r1bios in case of extreme VM load:
*/
@@ -1066,17 +1070,107 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
kfree(plug);
}
-static void raid1_make_request(struct mddev *mddev, struct bio * bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+ struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
- struct r1bio *r1_bio;
struct bio *read_bio;
+ struct bitmap *bitmap = mddev->bitmap;
+ const int op = bio_op(bio);
+ const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+ int sectors_handled;
+ int max_sectors;
+ int rdisk;
+
+ wait_barrier(conf, bio);
+
+read_again:
+ rdisk = read_balance(conf, r1_bio, &max_sectors);
+
+ if (rdisk < 0) {
+ /* couldn't find anywhere to read from */
+ raid_end_bio_io(r1_bio);
+ return;
+ }
+ mirror = conf->mirrors + rdisk;
+
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ bitmap) {
+ /*
+ * Reading from a write-mostly device must take care not to
+ * over-take any writes that are 'behind'
+ */
+ raid1_log(mddev, "wait behind writes");
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ r1_bio->read_disk = rdisk;
+ r1_bio->start_next_window = 0;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+
+ r1_bio->bios[rdisk] = read_bio;
+
+ read_bio->bi_iter.bi_sector = r1_bio->sector +
+ mirror->rdev->data_offset;
+ read_bio->bi_bdev = mirror->rdev->bdev;
+ read_bio->bi_end_io = raid1_end_read_request;
+ bio_set_op_attrs(read_bio, op, do_sync);
+ if (test_bit(FailFast, &mirror->rdev->flags) &&
+ test_bit(R1BIO_FailFast, &r1_bio->state))
+ read_bio->bi_opf |= MD_FAILFAST;
+ read_bio->bi_private = r1_bio;
+
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+ read_bio, disk_devt(mddev->gendisk),
+ r1_bio->sector);
+
+ if (max_sectors < r1_bio->sectors) {
+ /*
+ * could not read all from this device, so we will need another
+ * r1_bio.
+ */
+ sectors_handled = (r1_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+
+ /*
+ * Cannot call generic_make_request directly as that will be
+ * queued in __make_request and subsequent mempool_alloc might
+ * block waiting for it. So hand bio over to raid1d.
+ */
+ reschedule_retry(r1_bio);
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
+}
+
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+ struct r1bio *r1_bio)
+{
+ struct r1conf *conf = mddev->private;
int i, disks;
- struct bitmap *bitmap;
+ struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
const int op = bio_op(bio);
- const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_opf &
(REQ_PREFLUSH | REQ_FUA));
@@ -1096,15 +1190,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
md_write_start(mddev, bio); /* wait on superblock update early */
- if (bio_data_dir(bio) == WRITE &&
- ((bio_end_sector(bio) > mddev->suspend_lo &&
+ if ((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
- /* As the suspend_* range is controlled by
- * userspace, we want an interruptible
- * wait.
+ bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
+
+ /*
+ * As the suspend_* range is controlled by userspace, we want
+ * an interruptible wait.
*/
DEFINE_WAIT(w);
for (;;) {
@@ -1115,128 +1209,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
!md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector, bio_end_sector(bio))))
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio))))
break;
schedule();
}
finish_wait(&conf->wait_barrier, &w);
}
-
start_next_window = wait_barrier(conf, bio);
- bitmap = mddev->bitmap;
-
- /*
- * make_request() can abort the operation when read-ahead is being
- * used and no empty request is available.
- *
- */
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio);
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector;
-
- /* We might need to issue multiple reads to different
- * devices if there are bad blocks around, so we keep
- * track of the number of reads in bio->bi_phys_segments.
- * If this is 0, there is only one r1_bio and no locking
- * will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
- if (rw == READ) {
- /*
- * read balancing logic:
- */
- int rdisk;
-
-read_again:
- rdisk = read_balance(conf, r1_bio, &max_sectors);
-
- if (rdisk < 0) {
- /* couldn't find anywhere to read from */
- raid_end_bio_io(r1_bio);
- return;
- }
- mirror = conf->mirrors + rdisk;
-
- if (test_bit(WriteMostly, &mirror->rdev->flags) &&
- bitmap) {
- /* Reading from a write-mostly device must
- * take care not to over-take any writes
- * that are 'behind'
- */
- raid1_log(mddev, "wait behind writes");
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
- r1_bio->read_disk = rdisk;
- r1_bio->start_next_window = 0;
-
- read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
-
- r1_bio->bios[rdisk] = read_bio;
-
- read_bio->bi_iter.bi_sector = r1_bio->sector +
- mirror->rdev->data_offset;
- read_bio->bi_bdev = mirror->rdev->bdev;
- read_bio->bi_end_io = raid1_end_read_request;
- bio_set_op_attrs(read_bio, op, do_sync);
- if (test_bit(FailFast, &mirror->rdev->flags) &&
- test_bit(R1BIO_FailFast, &r1_bio->state))
- read_bio->bi_opf |= MD_FAILFAST;
- read_bio->bi_private = r1_bio;
-
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
- read_bio, disk_devt(mddev->gendisk),
- r1_bio->sector);
-
- if (max_sectors < r1_bio->sectors) {
- /* could not read all from this device, so we will
- * need another r1_bio.
- */
-
- sectors_handled = (r1_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- /* Cannot call generic_make_request directly
- * as that will be queued in __make_request
- * and subsequent mempool_alloc might block waiting
- * for it. So hand bio over to raid1d.
- */
- reschedule_retry(r1_bio);
-
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio) - sectors_handled;
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector +
- sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
- return;
- }
-
- /*
- * WRITE:
- */
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
raid1_log(mddev, "wait queued");
@@ -1280,8 +1261,7 @@ read_again:
int bad_sectors;
int is_bad;
- is_bad = is_badblock(rdev, r1_bio->sector,
- max_sectors,
+ is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
/* mustn't write here until the bad block is
@@ -1370,7 +1350,8 @@ read_again:
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+ bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
if (first_clone) {
/* do behind I/O ?
@@ -1464,6 +1445,40 @@ read_again:
wake_up(&conf->wait_barrier);
}
+static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct r1conf *conf = mddev->private;
+ struct r1bio *r1_bio;
+
+ /*
+ * make_request() can abort the operation when read-ahead is being
+ * used and no empty request is available.
+ *
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio);
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector;
+
+ /*
+ * We might need to issue multiple reads to different devices if there
+ * are bad blocks around, so we keep track of the number of reads in
+ * bio->bi_phys_segments. If this is 0, there is only one r1_bio and
+ * no locking will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ bio_clear_flag(bio, BIO_SEG_VALID);
+
+ if (bio_data_dir(bio) == READ)
+ raid1_read_request(mddev, bio, r1_bio);
+ else
+ raid1_write_request(mddev, bio, r1_bio);
+}
+
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
{
struct r1conf *conf = mddev->private;
@@ -3246,8 +3261,8 @@ static void *raid1_takeover(struct mddev *mddev)
if (!IS_ERR(conf)) {
/* Array must appear to be quiesced */
conf->array_frozen = 1;
- clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ mddev_clear_unsupported_flags(mddev,
+ UNSUPPORTED_MDDEV_FLAGS);
}
return conf;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ab5e86209322..1920756828df 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1087,23 +1087,122 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
kfree(plug);
}
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void raid10_read_request(struct mddev *mddev, struct bio *bio,
+ struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
- struct r10bio *r10_bio;
struct bio *read_bio;
+ const int op = bio_op(bio);
+ const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+ int sectors_handled;
+ int max_sectors;
+ sector_t sectors;
+ struct md_rdev *rdev;
+ int slot;
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+ wait_barrier(conf);
+
+ sectors = bio_sectors(bio);
+ while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ bio->bi_iter.bi_sector < conf->reshape_progress &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+ /*
+ * IO spans the reshape position. Need to wait for reshape to
+ * pass
+ */
+ raid10_log(conf->mddev, "wait reshape");
+ allow_barrier(conf);
+ wait_event(conf->wait_barrier,
+ conf->reshape_progress <= bio->bi_iter.bi_sector ||
+ conf->reshape_progress >= bio->bi_iter.bi_sector +
+ sectors);
+ wait_barrier(conf);
+ }
+
+read_again:
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (!rdev) {
+ raid_end_bio_io(r10_bio);
+ return;
+ }
+ slot = r10_bio->read_slot;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+
+ r10_bio->devs[slot].bio = read_bio;
+ r10_bio->devs[slot].rdev = rdev;
+
+ read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
+ choose_data_offset(r10_bio, rdev);
+ read_bio->bi_bdev = rdev->bdev;
+ read_bio->bi_end_io = raid10_end_read_request;
+ bio_set_op_attrs(read_bio, op, do_sync);
+ if (test_bit(FailFast, &rdev->flags) &&
+ test_bit(R10BIO_FailFast, &r10_bio->state))
+ read_bio->bi_opf |= MD_FAILFAST;
+ read_bio->bi_private = r10_bio;
+
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+ read_bio, disk_devt(mddev->gendisk),
+ r10_bio->sector);
+ if (max_sectors < r10_bio->sectors) {
+ /*
+ * Could not read all from this device, so we will need another
+ * r10_bio.
+ */
+ sectors_handled = (r10_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /*
+ * Cannot call generic_make_request directly as that will be
+ * queued in __generic_make_request and subsequent
+ * mempool_alloc might block waiting for it. so hand bio over
+ * to raid10d.
+ */
+ reschedule_retry(r10_bio);
+
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r10_bio->state = 0;
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
+ return;
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+ struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
int i;
const int op = bio_op(bio);
- const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid10_plug_cb *plug = NULL;
+ sector_t sectors;
int sectors_handled;
int max_sectors;
- int sectors;
md_write_start(mddev, bio);
@@ -1118,8 +1217,9 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
- /* IO spans the reshape position. Need to wait for
- * reshape to pass
+ /*
+ * IO spans the reshape position. Need to wait for reshape to
+ * pass
*/
raid10_log(conf->mddev, "wait reshape");
allow_barrier(conf);
@@ -1129,8 +1229,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
sectors);
wait_barrier(conf);
}
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio_data_dir(bio) == WRITE &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
@@ -1148,98 +1248,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
conf->reshape_safe = mddev->reshape_position;
}
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = sectors;
-
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector;
- r10_bio->state = 0;
-
- /* We might need to issue multiple reads to different
- * devices if there are bad blocks around, so we keep
- * track of the number of reads in bio->bi_phys_segments.
- * If this is 0, there is only one r10_bio and no locking
- * will be needed when the request completes. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
- if (rw == READ) {
- /*
- * read balancing logic:
- */
- struct md_rdev *rdev;
- int slot;
-
-read_again:
- rdev = read_balance(conf, r10_bio, &max_sectors);
- if (!rdev) {
- raid_end_bio_io(r10_bio);
- return;
- }
- slot = r10_bio->read_slot;
-
- read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
-
- r10_bio->devs[slot].bio = read_bio;
- r10_bio->devs[slot].rdev = rdev;
-
- read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
- choose_data_offset(r10_bio, rdev);
- read_bio->bi_bdev = rdev->bdev;
- read_bio->bi_end_io = raid10_end_read_request;
- bio_set_op_attrs(read_bio, op, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R10BIO_FailFast, &r10_bio->state))
- read_bio->bi_opf |= MD_FAILFAST;
- read_bio->bi_private = r10_bio;
-
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
- read_bio, disk_devt(mddev->gendisk),
- r10_bio->sector);
- if (max_sectors < r10_bio->sectors) {
- /* Could not read all from this device, so we will
- * need another r10_bio.
- */
- sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- /* Cannot call generic_make_request directly
- * as that will be queued in __generic_make_request
- * and subsequent mempool_alloc might block
- * waiting for it. so hand bio over to raid10d.
- */
- reschedule_retry(r10_bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
- r10_bio->state = 0;
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector +
- sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
- return;
- }
-
- /*
- * WRITE:
- */
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
raid10_log(mddev, "wait queued");
@@ -1300,8 +1308,7 @@ retry_write:
int bad_sectors;
int is_bad;
- is_bad = is_badblock(rdev, dev_sector,
- max_sectors,
+ is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
/* Mustn't write here until the bad block
@@ -1405,8 +1412,7 @@ retry_write:
r10_bio->devs[i].bio = mbio;
mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
- choose_data_offset(r10_bio,
- rdev));
+ choose_data_offset(r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1457,8 +1463,7 @@ retry_write:
r10_bio->devs[i].repl_bio = mbio;
mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
- choose_data_offset(
- r10_bio, rdev));
+ choose_data_offset(r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1503,6 +1508,36 @@ retry_write:
one_write_done(r10_bio);
}
+static void __make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct r10conf *conf = mddev->private;
+ struct r10bio *r10_bio;
+
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio);
+
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector;
+ r10_bio->state = 0;
+
+ /*
+ * We might need to issue multiple reads to different devices if there
+ * are bad blocks around, so we keep track of the number of reads in
+ * bio->bi_phys_segments. If this is 0, there is only one r10_bio and
+ * no locking will be needed when the request completes. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ bio_clear_flag(bio, BIO_SEG_VALID);
+
+ if (bio_data_dir(bio) == READ)
+ raid10_read_request(mddev, bio, r10_bio);
+ else
+ raid10_write_request(mddev, bio, r10_bio);
+}
+
static void raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index d7bfb6fc8aef..302dea3296ba 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -162,6 +162,8 @@ struct r5l_log {
/* to submit async io_units, to fulfill ordering of flush */
struct work_struct deferred_io_work;
+ /* to disable write back during in degraded mode */
+ struct work_struct disable_writeback_work;
};
/*
@@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work)
r5l_do_submit_io(log, io);
}
+static void r5c_disable_writeback_async(struct work_struct *work)
+{
+ struct r5l_log *log = container_of(work, struct r5l_log,
+ disable_writeback_work);
+ struct mddev *mddev = log->rdev->mddev;
+
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ return;
+ pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
+ mdname(mddev));
+ mddev_suspend(mddev);
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ mddev_resume(mddev);
+}
+
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
@@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
next_checkpoint = r5c_calculate_new_cp(conf);
spin_unlock_irq(&log->io_list_lock);
- BUG_ON(reclaimable < 0);
-
if (reclaimable == 0 || !write_super)
return;
@@ -1682,8 +1697,7 @@ out:
static struct stripe_head *
r5c_recovery_alloc_stripe(struct r5conf *conf,
- sector_t stripe_sect,
- sector_t log_start)
+ sector_t stripe_sect)
{
struct stripe_head *sh;
@@ -1692,7 +1706,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
return NULL; /* no more stripe available */
r5l_recovery_reset_stripe(sh);
- sh->log_start = log_start;
return sh;
}
@@ -1862,7 +1875,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
stripe_sect);
if (!sh) {
- sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
+ sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
/*
* cannot get stripe from raid5_get_active_stripe
* try replay some stripes
@@ -1871,7 +1884,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
r5c_recovery_replay_stripes(
cached_stripe_list, ctx);
sh = r5c_recovery_alloc_stripe(
- conf, stripe_sect, ctx->pos);
+ conf, stripe_sect);
}
if (!sh) {
pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
@@ -1879,8 +1892,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
conf->min_nr_stripes * 2);
raid5_set_cache_size(mddev,
conf->min_nr_stripes * 2);
- sh = r5c_recovery_alloc_stripe(
- conf, stripe_sect, ctx->pos);
+ sh = r5c_recovery_alloc_stripe(conf,
+ stripe_sect);
}
if (!sh) {
pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
@@ -1894,7 +1907,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
- sh->log_start = ctx->pos;
list_move_tail(&sh->lru, cached_stripe_list);
}
r5l_recovery_load_data(log, sh, ctx, payload,
@@ -1933,8 +1945,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
set_bit(R5_UPTODATE, &dev->flags);
}
}
- list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
- atomic_inc(&log->stripe_in_journal_count);
}
/*
@@ -2067,9 +2077,10 @@ static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
- struct stripe_head *sh, *next;
+ struct stripe_head *sh;
struct mddev *mddev = log->rdev->mddev;
struct page *page;
+ sector_t next_checkpoint = MaxSector;
page = alloc_page(GFP_KERNEL);
if (!page) {
@@ -2078,7 +2089,9 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
return -ENOMEM;
}
- list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ WARN_ON(list_empty(&ctx->cached_list));
+
+ list_for_each_entry(sh, &ctx->cached_list, lru) {
struct r5l_meta_block *mb;
int i;
int offset;
@@ -2123,14 +2136,42 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE, REQ_FUA, false);
sh->log_start = ctx->pos;
+ list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+ atomic_inc(&log->stripe_in_journal_count);
ctx->pos = write_pos;
ctx->seq += 1;
+ next_checkpoint = sh->log_start;
+ }
+ log->next_checkpoint = next_checkpoint;
+ __free_page(page);
+ return 0;
+}
+
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ struct stripe_head *sh, *next;
+
+ if (ctx->data_only_stripes == 0)
+ return;
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+
+ list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ r5c_make_stripe_write_out(sh);
+ set_bit(STRIPE_HANDLE, &sh->state);
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
- __free_page(page);
- return 0;
+
+ md_wakeup_thread(conf->mddev->thread);
+ /* reuse conf->wait_for_quiescent in recovery */
+ wait_event(conf->wait_for_quiescent,
+ atomic_read(&conf->active_stripes) == 0);
+
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
}
static int r5l_recovery_log(struct r5l_log *log)
@@ -2139,7 +2180,6 @@ static int r5l_recovery_log(struct r5l_log *log)
struct r5l_recovery_ctx ctx;
int ret;
sector_t pos;
- struct stripe_head *sh;
ctx.pos = log->last_checkpoint;
ctx.seq = log->last_cp_seq;
@@ -2160,35 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log)
pos = ctx.pos;
ctx.seq += 10000;
- if (ctx.data_only_stripes == 0) {
- log->next_checkpoint = ctx.pos;
- r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
- ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
- } else {
- sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
- log->next_checkpoint = sh->log_start;
- }
if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
- else {
- pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
+ else
+ pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
mdname(mddev), ctx.data_only_stripes,
ctx.data_parity_stripes);
- if (ctx.data_only_stripes > 0)
- if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
- pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
- mdname(mddev));
- return -EIO;
- }
+ if (ctx.data_only_stripes == 0) {
+ log->next_checkpoint = ctx.pos;
+ r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+ ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+ } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+ pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+ mdname(mddev));
+ return -EIO;
}
log->log_start = ctx.pos;
log->seq = ctx.seq;
log->last_checkpoint = pos;
r5l_write_super(log, pos);
+
+ r5c_recovery_flush_data_only_stripes(log, &ctx);
return 0;
}
@@ -2250,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
val > R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
+ if (raid5_calc_degraded(conf) > 0 &&
+ val == R5C_JOURNAL_MODE_WRITE_BACK)
+ return -EINVAL;
+
mddev_suspend(mddev);
conf->log->r5c_journal_mode = val;
mddev_resume(mddev);
@@ -2304,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf,
set_bit(STRIPE_R5C_CACHING, &sh->state);
}
+ /*
+ * When run in degraded mode, array is set to write-through mode.
+ * This check helps drain pending write safely in the transition to
+ * write-through mode.
+ */
+ if (s->failed) {
+ r5c_make_stripe_write_out(sh);
+ return -EAGAIN;
+ }
+
for (i = disks; i--; ) {
dev = &sh->dev[i];
/* if non-overwrite, use writing-out phase */
@@ -2354,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
struct page *p = sh->dev[i].orig_page;
sh->dev[i].orig_page = sh->dev[i].page;
+ clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
if (!using_disk_info_extra_page)
put_page(p);
}
@@ -2418,9 +2470,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
- if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
- return;
-
spin_lock_irq(&conf->log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&conf->log->stripe_in_journal_lock);
@@ -2561,6 +2610,19 @@ ioerr:
return ret;
}
+void r5c_update_on_rdev_error(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+ struct r5l_log *log = conf->log;
+
+ if (!log)
+ return;
+
+ if (raid5_calc_degraded(conf) > 0 &&
+ conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ schedule_work(&log->disable_writeback_work);
+}
+
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2633,20 +2695,23 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->no_space_stripes_lock);
INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+ INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
INIT_LIST_HEAD(&log->stripe_in_journal_list);
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
+ rcu_assign_pointer(conf->log, log);
+
if (r5l_load_log(log))
goto error;
- rcu_assign_pointer(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
error:
+ rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
mempool_destroy(log->meta_pool);
@@ -2663,6 +2728,7 @@ io_kc:
void r5l_exit_log(struct r5l_log *log)
{
+ flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
bioset_free(log->bs);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 06d7279bdd04..3c7e106c12a2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -62,6 +62,8 @@
#include "raid0.h"
#include "bitmap.h"
+#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
+
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
@@ -554,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
*/
-static int calc_degraded(struct r5conf *conf)
+int raid5_calc_degraded(struct r5conf *conf)
{
int degraded, degraded2;
int i;
@@ -617,7 +619,7 @@ static int has_failed(struct r5conf *conf)
if (conf->mddev->reshape_position == MaxSector)
return conf->mddev->degraded > conf->max_degraded;
- degraded = calc_degraded(conf);
+ degraded = raid5_calc_degraded(conf);
if (degraded > conf->max_degraded)
return 1;
return 0;
@@ -1013,7 +1015,17 @@ again:
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
- sh->dev[i].vec.bv_page = sh->dev[i].page;
+
+ if (!op_is_write(op) &&
+ test_bit(R5_InJournal, &sh->dev[i].flags))
+ /*
+ * issuing read for a page in journal, this
+ * must be preparing for prexor in rmw; read
+ * the data into orig_page
+ */
+ sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
+ else
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
@@ -2378,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi)
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+ if (test_bit(R5_InJournal, &sh->dev[i].flags))
+ /*
+ * end read for a page in journal, this
+ * must be preparing for prexor in rmw
+ */
+ set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
if (atomic_read(&rdev->read_errors))
atomic_set(&rdev->read_errors, 0);
} else {
@@ -2536,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
spin_lock_irqsave(&conf->device_lock, flags);
clear_bit(In_sync, &rdev->flags);
- mddev->degraded = calc_degraded(conf);
+ mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -2550,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
+ r5c_update_on_rdev_error(mddev);
}
/*
@@ -2878,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
return r_sector;
}
+/*
+ * There are cases where we want handle_stripe_dirtying() and
+ * schedule_reconstruction() to delay towrite to some dev of a stripe.
+ *
+ * This function checks whether we want to delay the towrite. Specifically,
+ * we delay the towrite when:
+ *
+ * 1. degraded stripe has a non-overwrite to the missing dev, AND this
+ * stripe has data in journal (for other devices).
+ *
+ * In this case, when reading data for the non-overwrite dev, it is
+ * necessary to handle complex rmw of write back cache (prexor with
+ * orig_page, and xor with page). To keep read path simple, we would
+ * like to flush data in journal to RAID disks first, so complex rmw
+ * is handled in the write patch (handle_stripe_dirtying).
+ *
+ */
+static inline bool delay_towrite(struct r5dev *dev,
+ struct stripe_head_state *s)
+{
+ return !test_bit(R5_OVERWRITE, &dev->flags) &&
+ !test_bit(R5_Insync, &dev->flags) && s->injournal;
+}
+
static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int rcw, int expand)
@@ -2898,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->towrite) {
+ if (dev->towrite && !delay_towrite(dev, s)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
@@ -3293,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
return rv;
}
-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
int disk_idx, int disks)
{
@@ -3390,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
return 0;
}
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
int disk_idx, int disks)
{
@@ -3476,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
* midst of changing due to a write
*/
if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
- !sh->reconstruct_state)
+ !sh->reconstruct_state) {
+
+ /*
+ * For degraded stripe with data in journal, do not handle
+ * read requests yet, instead, flush the stripe to raid
+ * disks first, this avoids handling complex rmw of write
+ * back cache (prexor with orig_page, and then xor with
+ * page) in the read path
+ */
+ if (s->injournal && s->failed) {
+ if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+ r5c_make_stripe_write_out(sh);
+ goto out;
+ }
+
for (i = disks; i--; )
if (fetch_block(sh, s, i, disks))
break;
+ }
+out:
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -3592,6 +3651,21 @@ unhash:
break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}
+/*
+ * For RMW in write back cache, we need extra page in prexor to store the
+ * old data. This page is stored in dev->orig_page.
+ *
+ * This function checks whether we have data for prexor. The exact logic
+ * is:
+ * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
+ */
+static inline bool uptodate_for_rmw(struct r5dev *dev)
+{
+ return (test_bit(R5_UPTODATE, &dev->flags)) &&
+ (!test_bit(R5_InJournal, &dev->flags) ||
+ test_bit(R5_OrigPageUPTDODATE, &dev->flags));
+}
+
static int handle_stripe_dirtying(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s,
@@ -3620,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+ if (((dev->towrite && !delay_towrite(dev, s)) ||
+ i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !((test_bit(R5_UPTODATE, &dev->flags) &&
- (!test_bit(R5_InJournal, &dev->flags) ||
- dev->page != dev->orig_page)) ||
+ !(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rmw++;
@@ -3637,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_InJournal, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rcw++;
@@ -3687,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite ||
+ if (((dev->towrite && !delay_towrite(dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !((test_bit(R5_UPTODATE, &dev->flags) &&
- (!test_bit(R5_InJournal, &dev->flags) ||
- dev->page != dev->orig_page)) ||
+ !(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3720,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_InJournal, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
if (test_bit(R5_Insync, &dev->flags) &&
@@ -7023,7 +7092,7 @@ static int raid5_run(struct mddev *mddev)
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
- mddev->degraded = calc_degraded(conf);
+ mddev->degraded = raid5_calc_degraded(conf);
if (has_failed(conf)) {
pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
@@ -7270,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev)
}
}
spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded = calc_degraded(conf);
+ mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
print_raid5_conf(conf);
return count;
@@ -7630,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev)
* pre and post number of devices.
*/
spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded = calc_degraded(conf);
+ mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
@@ -7718,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
} else {
int d;
spin_lock_irq(&conf->device_lock);
- mddev->degraded = calc_degraded(conf);
+ mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irq(&conf->device_lock);
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
@@ -7829,8 +7898,9 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
mddev->new_chunk_sectors = chunksect;
ret = setup_conf(mddev);
- if (!IS_ERR_VALUE(ret))
- clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ if (!IS_ERR(ret))
+ mddev_clear_unsupported_flags(mddev,
+ UNSUPPORTED_MDDEV_FLAGS);
return ret;
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e1362ab36..1440fa26e296 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -322,6 +322,11 @@ enum r5dev_flags {
* data and parity being written are in the journal
* device
*/
+ R5_OrigPageUPTDODATE, /* with write back cache, we read old data into
+ * dev->orig_page for prexor. When this flag is
+ * set, orig_page contains latest data in the
+ * raid disk.
+ */
};
/*
@@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
+extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
@@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
#endif