From 010228e4a932ca1e8365e3b58c8e1e44c16ff793 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 2 Jul 2018 16:26:24 +0800 Subject: md-cluster: clear another node's suspend_area after the copy is finished When one node leaves cluster or stops the resyncing (resync or recovery) array, then other nodes need to call recover_bitmaps to continue the unfinished task. But we need to clear suspend_area later after other nodes copy the resync information to their bitmap (by call bitmap_copy_from_slot). Otherwise, all nodes could write to the suspend_area even the suspend_area is not handled by any node, because area_resyncing returns 0 at the beginning of raid1_write_request. Which means one node could write suspend_area while another node is resyncing the same area, then data could be inconsistent. So let's clear suspend_area later to avoid above issue with the protection of bm lock. Also it is straightforward to clear suspend_area after nodes have copied the resync info to bitmap. Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 021cbf9ef1bf..1ac945f7a3c2 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -304,15 +304,6 @@ static void recover_bitmaps(struct md_thread *thread) while (cinfo->recovery_map) { slot = fls64((u64)cinfo->recovery_map) - 1; - /* Clear suspend_area associated with the bitmap */ - spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - } - spin_unlock_irq(&cinfo->suspend_lock); - snprintf(str, 64, "bitmap%04d", slot); bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { @@ -331,6 +322,16 @@ static void recover_bitmaps(struct md_thread *thread) pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); goto clear_bit; } + + /* Clear suspend_area associated with the bitmap */ + spin_lock_irq(&cinfo->suspend_lock); + list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) + if (slot == s->slot) { + list_del(&s->list); + kfree(s); + } + spin_unlock_irq(&cinfo->suspend_lock); + if (hi > 0) { if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; -- cgit v1.2.3-59-g8ed1b From 0357ba27bd611ff496390fdb172fdb31ca475398 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 2 Jul 2018 16:26:25 +0800 Subject: md-cluster: show array's status more accurate When resync or recovery is happening in one node, other nodes don't show the appropriate info now. For example, when create an array in master node without "--assume-clean", then assemble the array in slave nodes, you can see "resync=PENDING" when read /proc/mdstat in slave nodes. However, the info is confusing since "PENDING" status is introduced for start array in read-only mode. We introduce RESYNCING_REMOTE flag to indicate that resync thread is running in remote node. The flags is set when node receive RESYNCING msg. And we clear the REMOTE flag in following cases: 1. resync or recover is finished in master node, which means slaves receive msg with both lo and hi are set to 0. 2. node continues resync/recovery in recover_bitmaps. 3. when resync_finish is called. Then we show accurate information in status_resync by check REMOTE flags and with other conditions. Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 18 ++++++++++++++++-- drivers/md/md.c | 17 +++++++++++++++++ drivers/md/md.h | 1 + 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1ac945f7a3c2..5ed13c4fe72d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -338,8 +338,14 @@ static void recover_bitmaps(struct md_thread *thread) /* wake up thread to continue resync in case resync * is not finished */ if (mddev->recovery_cp != MaxSector) { - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* + * clear the REMOTE flag since we will launch + * resync thread in current node. + */ + clear_bit(MD_RESYNCING_REMOTE, + &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); } } clear_bit: @@ -458,6 +464,11 @@ static void process_suspend_info(struct mddev *mddev, struct suspend_info *s; if (!hi) { + /* + * clear the REMOTE flag since resync or recovery is finished + * in remote node. + */ + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); remove_suspend_info(mddev, slot); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -586,6 +597,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) revalidate_disk(mddev->gendisk); break; case RESYNCING: + set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); process_suspend_info(mddev, le32_to_cpu(msg->slot), le64_to_cpu(msg->low), le64_to_cpu(msg->high)); @@ -1266,6 +1278,8 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); dlm_unlock_sync(cinfo->resync_lockres); return resync_info_update(mddev, 0, 0); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 994aed2f9dff..da83d8710579 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7680,6 +7680,23 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) resync -= atomic_read(&mddev->recovery_active); if (resync == 0) { + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset != MaxSector && + rdev->recovery_offset) { + seq_printf(seq, "\trecover=REMOTE"); + return 1; + } + if (mddev->reshape_position != MaxSector) + seq_printf(seq, "\treshape=REMOTE"); + else + seq_printf(seq, "\tresync=REMOTE"); + return 1; + } if (mddev->recovery_cp < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; diff --git a/drivers/md/md.h b/drivers/md/md.h index 2d148bdaba74..8afd6bfdbfb9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -496,6 +496,7 @@ enum recovery_flags { MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ + MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; static inline int __must_check mddev_lock(struct mddev *mddev) -- cgit v1.2.3-59-g8ed1b From df8c676418e1eb6f7bbafc1e01983f3538584894 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 2 Jul 2018 16:26:26 +0800 Subject: md-cluster: don't send msg if array is closing If we close an array which resync thread is running, then we don't need the node to send msg since another node would launch the resync thread to continue the rest works. Also send a message is time consuming, we should avoid it. Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5ed13c4fe72d..e8a74e92be30 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1281,7 +1281,15 @@ static int resync_finish(struct mddev *mddev) clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); dlm_unlock_sync(cinfo->resync_lockres); - return resync_info_update(mddev, 0, 0); + + /* + * If resync thread is interrupted so we can't say resync is finished, + * another node will launch resync thread to continue. + */ + if (test_bit(MD_CLOSING, &mddev->flags)) + return 0; + else + return resync_info_update(mddev, 0, 0); } static int area_resyncing(struct mddev *mddev, int direction, -- cgit v1.2.3-59-g8ed1b From ebc7709f65001911f275e13dd5f29b02803c0688 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Jul 2018 07:51:09 +0100 Subject: md/r5cache: remove redundant pointer bio Pointer bio is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'bio' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2b775abf377b..7416db70c6cc 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -717,7 +717,6 @@ static void r5c_disable_writeback_async(struct work_struct *work) static void r5l_submit_current_io(struct r5l_log *log) { struct r5l_io_unit *io = log->current_io; - struct bio *bio; struct r5l_meta_block *block; unsigned long flags; u32 crc; @@ -730,7 +729,6 @@ static void r5l_submit_current_io(struct r5l_log *log) block->meta_size = cpu_to_le32(io->meta_offset); crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); - bio = io->current_bio; log->current_io = NULL; spin_lock_irqsave(&log->io_list_lock, flags); -- cgit v1.2.3-59-g8ed1b From 685dbcaa25becfba864b72ec3f03470833b7f692 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 3 Jul 2018 22:01:36 +0200 Subject: drivers/md/raid5: Use irqsave variant of atomic_dec_and_lock() The irqsave variant of atomic_dec_and_lock handles irqsave/restore when taking/releasing the spin lock. With this variant the call of local_irq_save is no longer required. Cc: Shaohua Li Cc: linux-raid@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2031506a0ecd..e933bff9459e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -409,16 +409,15 @@ void raid5_release_stripe(struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); return; slow_path: - local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ - if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { + if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { INIT_LIST_HEAD(&list); hash = sh->hash_lock_index; do_release_stripe(conf, sh, &list); spin_unlock(&conf->device_lock); release_inactive_stripe_list(conf, &list, hash); + local_irq_restore(flags); } - local_irq_restore(flags); } static inline void remove_hash(struct stripe_head *sh) -- cgit v1.2.3-59-g8ed1b From 08edaaa6d6fa5f6ac9be2adcc71f80b4083b6494 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 3 Jul 2018 22:01:37 +0200 Subject: drivers/md/raid5: Do not disable irq on release_inactive_stripe_list() call There is no need to invoke release_inactive_stripe_list() with interrupts disabled. All call sites, except raid5_release_stripe(), unlock ->device_lock and enable interrupts before invoking the function. Make it consistent. Cc: Shaohua Li Cc: linux-raid@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e933bff9459e..ca1dd0cb04c5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -414,9 +414,8 @@ slow_path: INIT_LIST_HEAD(&list); hash = sh->hash_lock_index; do_release_stripe(conf, sh, &list); - spin_unlock(&conf->device_lock); + spin_unlock_irqrestore(&conf->device_lock, flags); release_inactive_stripe_list(conf, &list, hash); - local_irq_restore(flags); } } -- cgit v1.2.3-59-g8ed1b From d63e2fc804c46e50eee825c5d3a7228e07048b47 Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Wed, 1 Aug 2018 17:08:36 +0800 Subject: md/raid5: fix data corruption of replacements after originals dropped During raid5 replacement, the stripes can be marked with R5_NeedReplace flag. Data can be read from being-replaced devices and written to replacing spares without reading all other devices. (It's 'replace' mode. s.replacing = 1) If a being-replaced device is dropped, the replacement progress will be interrupted and resumed with pure recovery mode. However, existing stripes before being interrupted cannot read from the dropped device anymore. It prints lots of WARN_ON messages. And it results in data corruption because existing stripes write problematic data into its replacement device and update the progress. \# Erase disks (1MB + 2GB) dd if=/dev/zero of=/dev/sda bs=1MB count=2049 dd if=/dev/zero of=/dev/sdb bs=1MB count=2049 dd if=/dev/zero of=/dev/sdc bs=1MB count=2049 dd if=/dev/zero of=/dev/sdd bs=1MB count=2049 mdadm -C /dev/md0 -amd -R -l5 -n3 -x0 /dev/sd[abc] -z 2097152 \# Ensure array stores non-zero data dd if=/root/data_4GB.iso of=/dev/md0 bs=1MB \# Start replacement mdadm /dev/md0 -a /dev/sdd mdadm /dev/md0 --replace /dev/sda Then, Hot-plug out /dev/sda during recovery, and wait for recovery done. echo check > /sys/block/md0/md/sync_action cat /sys/block/md0/md/mismatch_cnt # it will be greater than 0. Soon after you hot-plug out /dev/sda, you will see many WARN_ON messages. The replacement recovery will be interrupted shortly. After the recovery finishes, it will result in data corruption. Actually, it's just an unhandled case of replacement. In commit (md/raid5: fix interaction of 'replace' and 'recovery'.), if a NeedReplace device is not UPTODATE then that is an error, the commit just simply print WARN_ON but also mark these corrupted stripes with R5_WantReplace. (it means it's ready for writes.) To fix this case, we can leverage 'sync and replace' mode mentioned in commit <9a3e1101b827> (md/raid5: detect and handle replacements during recovery.). We can add logics to detect and use 'sync and replace' mode for these stripes. Reported-by: Alex Chen Reviewed-by: Alex Wu Reviewed-by: Chung-Chiang Cheng Signed-off-by: BingJing Chang Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ca1dd0cb04c5..81eaa221216c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4519,6 +4519,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->failed++; if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; + else if (!rdev) { + rdev = rcu_dereference( + conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } } if (test_bit(R5_InJournal, &dev->flags)) -- cgit v1.2.3-59-g8ed1b