From c744a65c1e2d59acc54333ce80a5b0702a98010b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Mar 2012 12:46:37 +1100 Subject: md: don't set md arrays to readonly on shutdown. It seems that with recent kernel, writeback can still be happening while shutdown is happening, and consequently data can be written after the md reboot notifier switches all arrays to read-only. This causes a BUG. So don't switch them to read-only - just mark them clean and set 'safemode' to '2' which mean that immediately after any write the array will be switch back to 'clean'. This could result in the shutdown happening when array is marked dirty, thus forcing a resync on reboot. However if you reboot without performing a "sync" first, you get to keep both halves. This is suitable for any stable kernel (though there might be some conflicts with obvious fixes in earlier kernels). Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index ce88755baf4a..115a6dd85837 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8157,30 +8157,23 @@ static int md_notify_reboot(struct notifier_block *this, struct mddev *mddev; int need_delay = 0; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { - - printk(KERN_INFO "md: stopping all md devices.\n"); - - for_each_mddev(mddev, tmp) { - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. - */ - md_set_readonly(mddev, 100); - mddev_unlock(mddev); - } - need_delay = 1; + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + __md_stop_writes(mddev); + mddev->safemode = 2; + mddev_unlock(mddev); } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - mdelay(1000*1); + need_delay = 1; } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } -- cgit v1.2.3-59-g8ed1b From dafb20fa34320a472deb7442f25a0c086e0feb33 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Mar 2012 12:46:39 +1100 Subject: md: tidy up rdev_for_each usage. md.h has an 'rdev_for_each()' macro for iterating the rdevs in an mddev. However it uses the 'safe' version of list_for_each_entry, and so requires the extra variable, but doesn't include 'safe' in the name, which is useful documentation. Consequently some places use this safe version without needing it, and many use an explicity list_for_each entry. So: - rename rdev_for_each to rdev_for_each_safe - create a new rdev_for_each which uses the plain list_for_each_entry, - use the 'safe' version only where needed, and convert all other list_for_each_entry calls to use rdev_for_each. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/dm-raid.c | 16 +++++------ drivers/md/faulty.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/md.c | 74 +++++++++++++++++++++++++------------------------- drivers/md/md.h | 5 +++- drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 10 +++---- drivers/md/raid1.c | 4 +-- drivers/md/raid10.c | 4 +-- drivers/md/raid5.c | 8 +++--- 11 files changed, 66 insertions(+), 63 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 239af9a9aad1..2c5dbc6248d3 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -171,7 +171,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, did_alloc = 1; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 787022c18187..c5a875d7b882 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size) static void super_sync(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *r, *t; + struct md_rdev *r; uint64_t failed_devices; struct dm_raid_superblock *sb; sb = page_address(rdev->sb_page); failed_devices = le64_to_cpu(sb->failed_devices); - rdev_for_each(r, t, mddev) + rdev_for_each(r, mddev) if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) failed_devices |= (1ULL << r->raid_disk); @@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) struct dm_raid_superblock *sb; uint32_t new_devs = 0; uint32_t rebuilds = 0; - struct md_rdev *r, *t; + struct md_rdev *r; struct dm_raid_superblock *sb2; sb = page_address(rdev->sb_page); @@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * case the In_sync bit will /not/ be set and * recovery_cp must be MaxSector. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!test_bit(In_sync, &r->flags)) { DMINFO("Device %d specified for rebuild: " "Clearing superblock", r->raid_disk); @@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * Now we set the Faulty bit for those devices that are * recorded in the superblock as failed. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!r->sb_page) continue; sb2 = page_address(r->sb_page); @@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int ret; - struct md_rdev *rdev, *freshest, *tmp; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (!rdev->meta_bdev) continue; @@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(mddev, freshest)) return -EINVAL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each(rdev, mddev) if ((rdev != freshest) && super_validate(mddev, rdev)) return -EINVAL; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index feb2c3c7bb44..45135f69509c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -315,7 +315,7 @@ static int run(struct mddev *mddev) } conf->nfaults = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) conf->rdev = rdev; md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 627456542fb3..67940741b19d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -138,7 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) cnt = 0; conf->array_sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int j = rdev->raid_disk; struct dev_info *disk = conf->disks + j; sector_t sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index 115a6dd85837..119de175bf12 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws) INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped @@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; @@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; @@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) sb->state |= (1<disks[0].state = (1<disks, same_set) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); @@ -1816,7 +1816,7 @@ retry: } max_dev = 0; - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1833,7 +1833,7 @@ retry: for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev) return 0; /* nothing to do */ if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; @@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev) { struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -2307,11 +2307,11 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); @@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares) * with the rest of the array) */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change) repeat: /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -2364,7 +2364,7 @@ repeat: clear_bit(MD_CHANGE_DEVS, &mddev->flags); if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { md_ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); @@ -2430,7 +2430,7 @@ repeat: mddev->events --; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) @@ -2444,7 +2444,7 @@ repeat: mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; if (rdev->sb_loaded != 1) @@ -2493,7 +2493,7 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); @@ -2896,7 +2896,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) struct md_rdev *rdev2; mddev_lock(mddev); - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && overlaps(rdev->data_offset, rdev->sectors, @@ -3193,7 +3193,7 @@ static void analyze_sbs(struct mddev * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -3214,7 +3214,7 @@ static void analyze_sbs(struct mddev * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { @@ -3403,7 +3403,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; } - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks @@ -3456,7 +3456,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) @@ -3465,7 +3465,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) continue; sysfs_unlink_rdev(mddev, rdev); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) @@ -4796,7 +4796,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -4867,8 +4867,8 @@ int md_run(struct mddev *mddev) struct md_rdev *rdev2; int warned = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -4945,7 +4945,7 @@ int md_run(struct mddev *mddev) mddev->in_sync = 1; smp_wmb(); mddev->ready = 1; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; @@ -5175,7 +5175,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); @@ -5226,7 +5226,7 @@ static void autorun_array(struct mddev *mddev) printk(KERN_INFO "md: running: "); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -5356,7 +5356,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) struct md_rdev *rdev; nr=working=insync=failed=spare=0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5923,7 +5923,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * grow, and re-add. */ return -EBUSY; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) @@ -6758,7 +6758,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -7170,7 +7170,7 @@ void md_do_sync(struct mddev *mddev) max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -7342,7 +7342,7 @@ void md_do_sync(struct mddev *mddev) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && @@ -7388,7 +7388,7 @@ static int remove_and_add_spares(struct mddev *mddev) mddev->curr_resync_completed = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || @@ -7406,7 +7406,7 @@ static int remove_and_add_spares(struct mddev *mddev) "degraded"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -7451,7 +7451,7 @@ static void reap_sync_thread(struct mddev *mddev) * do the superblock for an incrementally recovered device * written out. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!mddev->degraded || test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = -1; @@ -7529,7 +7529,7 @@ void md_check_recovery(struct mddev *mddev) * failed devices. */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && test_bit(Faulty, &rdev->flags) && diff --git a/drivers/md/md.h b/drivers/md/md.h index 44c63dfeeb2b..39acfe90cc26 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -519,7 +519,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) /* * iterates through the 'same array disks' ringlist */ -#define rdev_for_each(rdev, tmp, mddev) \ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index a222f516660e..9339e67fcc79 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev) } working_disks = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7294bd115e34..7ef5cbf31bb1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!conf) return -ENOMEM; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { pr_debug("md/raid0:%s: looking at %s\n", mdname(mddev), bdevname(rdev1->bdev, b)); @@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", mdname(mddev), @@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) smallest = NULL; dev = conf->devlist; err = -EINVAL; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { int j = rdev1->raid_disk; if (mddev->level == 10) { @@ -329,7 +329,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) array_sectors += rdev->sectors; return array_sectors; @@ -543,7 +543,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) return ERR_PTR(-EINVAL); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* check slot number for a disk */ if (rdev->raid_disk == mddev->raid_disks-1) { printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 118e0f69f224..a933bd4065a5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2504,7 +2504,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2622,7 +2622,7 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2ae7021320e1..52bb37d4026d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3253,7 +3253,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks / conf->near_copies)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks @@ -3419,7 +3419,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) conf = setup_conf(mddev); if (!IS_ERR(conf)) { - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) rdev->new_raid_disk = rdev->raid_disk * 2; conf->barrier = 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d38d235cc39d..23ac880bba9a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4842,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) pr_debug("raid456: run(%s) called.\n", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks || raid_disk < 0) @@ -5177,7 +5177,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); } @@ -5500,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; @@ -5546,7 +5546,7 @@ static int raid5_start_reshape(struct mddev *mddev) * such devices during the reshape and confusion could result. */ if (mddev->delta_disks >= 0) { - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { -- cgit v1.2.3-59-g8ed1b From 050b66152f87c79e8d66aed0e7996f9336462d5f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Mar 2012 12:46:39 +1100 Subject: md/raid10: handle merge_bvec_fn in member devices. Currently we don't honour merge_bvec_fn in member devices so if there is one, we force all requests to be single-page at most. This is not ideal. So enhance the raid10 merge_bvec_fn to check that function in children as well. This introduces a small problem. There is no locking around calls the ->merge_bvec_fn and subsequent calls to ->make_request. So a device added between these could end up getting a request which violates its merge_bvec_fn. Currently the best we can do is synchronize_sched(). This will work providing no preemption happens. If there is preemption, we just have to hope that new devices are largely consistent with old devices. Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + drivers/md/md.h | 8 ++++ drivers/md/raid10.c | 122 ++++++++++++++++++++++++++++++++++------------------ 3 files changed, 90 insertions(+), 41 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 119de175bf12..4566b61373d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5073,6 +5073,7 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.chunksize = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 39acfe90cc26..1c2063ccf48e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -128,6 +128,10 @@ struct md_rdev { enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ + Unmerged, /* device is being added to array and should + * be considerred for bvec_merge_fn but not + * yet for actual IO + */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet @@ -345,6 +349,10 @@ struct mddev { int degraded; /* whether md should consider * adding a spare */ + int merge_check_needed; /* at least one + * member device + * has a + * merge_bvec_fn */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 52bb37d4026d..e4a66ab6b0fb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -586,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset - * If near_copies == raid_disk, there are no striping issues, - * but in that case, the function isn't called at all. + * This requires checking for end-of-chunk if near_copies != raid_disks, + * and for subordinate merge_bvec_fns if merge_check_needed. */ static int raid10_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { struct mddev *mddev = q->queuedata; + struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; + if (conf->near_copies < conf->raid_disks) { + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + /* bio_add cannot handle a negative return */ + max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + } else + max = biovec->bv_len; + + if (mddev->merge_check_needed) { + struct r10bio r10_bio; + int s; + r10_bio.sector = sector; + raid10_find_phys(conf, &r10_bio); + rcu_read_lock(); + for (s = 0; s < conf->copies; s++) { + int disk = r10_bio.devs[s].devnum; + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; } /* @@ -668,11 +711,12 @@ retry: disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags) || r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (rdev == NULL) - continue; - if (test_bit(Faulty, &rdev->flags)) + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) @@ -1134,12 +1178,14 @@ retry_write: blocked_rdev = rrdev; break; } - if (rrdev && test_bit(Faulty, &rrdev->flags)) + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } @@ -1490,6 +1536,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror; int first = 0; int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -1502,6 +1549,11 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; @@ -1521,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) err = 0; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } conf->fullsync = 1; rcu_assign_pointer(p->replacement, rdev); break; @@ -1533,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; @@ -1554,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf, 0); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -2098,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && + !test_bit(Unmerged, &rdev->flags) && test_bit(In_sync, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { @@ -2151,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Unmerged, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -3273,15 +3323,6 @@ static int run(struct mddev *mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit max_segments to 1 lying - * within a single page. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } disk->head_position = 0; } @@ -3345,8 +3386,7 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - if (conf->near_copies < conf->raid_disks) - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); if (md_integrity_register(mddev)) goto out_free_conf; -- cgit v1.2.3-59-g8ed1b From 57148964d946614ffc6621539096ded1e7d896ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Mar 2012 12:46:40 +1100 Subject: md/bitmap: move printing of bitmap status to bitmap.c The part of /proc/mdstat which describes the bitmap should really be generated by code in bitmap.c. So move it there. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 28 ++++++++++++++++++++++++++++ drivers/md/bitmap.h | 1 + drivers/md/md.c | 23 +---------------------- 3 files changed, 30 insertions(+), 22 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2c5dbc6248d3..04df18e8885f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "md.h" #include "bitmap.h" @@ -1836,6 +1837,33 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + unsigned long flags; + + if (!bitmap) + return; + + spin_lock_irqsave(&bitmap->lock, flags); + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + bitmap->pages - bitmap->missing_pages, + bitmap->pages, + (bitmap->pages - bitmap->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); + seq_path(seq, &bitmap->file->f_path, " \t\n"); + } + + seq_printf(seq, "\n"); + spin_unlock_irqrestore(&bitmap->lock, flags); +} + static ssize_t location_show(struct mddev *mddev, char *page) { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 557e3e8ea73b..e196e6a560e8 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -227,6 +227,7 @@ void bitmap_destroy(struct mddev *mddev); void bitmap_print_sb(struct bitmap *bitmap); void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4566b61373d5..26591cc8ee87 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6725,7 +6725,6 @@ static int md_seq_show(struct seq_file *seq, void *v) struct mddev *mddev = v; sector_t sectors; struct md_rdev *rdev; - struct bitmap *bitmap; if (v == (void*)1) { struct md_personality *pers; @@ -6813,27 +6812,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } -- cgit v1.2.3-59-g8ed1b From d0962936bff659d20522555b517582a2715fd23f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Mar 2012 12:46:41 +1100 Subject: md: fix clearing of the 'changed' flags for the bad blocks list. In super_1_sync (the first hunk) we need to clear 'changed' before checking read_seqretry(), otherwise we might race with other code adding a bad block and so won't retry later. In md_update_sb (the second hunk), in the case where there is no metadata (neither persistent nor external), we treat any bad blocks as an error. However we need to clear the 'changed' flag before calling md_ack_all_badblocks, else it won't do anything. This patch is suitable for -stable release 3.0 and later. Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 26591cc8ee87..21a90efb13b7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1805,13 +1805,13 @@ retry: | BB_LEN(internal_bb)); *bbp++ = cpu_to_le64(store_bb); } + bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); - bb->changed = 0; } } @@ -2366,6 +2366,7 @@ repeat: clear_bit(MD_CHANGE_PENDING, &mddev->flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; md_ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } -- cgit v1.2.3-59-g8ed1b From ecb178bb2b154a40cfae9fa4c42e62ccfa81ac6b Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 19 Mar 2012 12:46:42 +1100 Subject: md: Add judgement bb->unacked_exist in function md_ack_all_badblocks(). If there are no unacked bad blocks, then there is no point searching for them to acknowledge them. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 21a90efb13b7..b572e1e386ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8021,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb) return; write_seqlock_irq(&bb->lock); - if (bb->changed == 0) { + if (bb->changed == 0 && bb->unacked_exist) { u64 *p = bb->page; int i; for (i = 0; i < bb->count ; i++) { -- cgit v1.2.3-59-g8ed1b