aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c716
1 files changed, 454 insertions, 262 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2efdb0d67460..9c4f7659f8b1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -58,11 +58,13 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
+#include <linux/list_sort.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -101,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
- local_irq_disable();
- spin_lock(conf->hash_locks);
+ spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
@@ -112,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
- for (i = NR_STRIPE_HASH_LOCKS; i; i--)
- spin_unlock(conf->hash_locks + i - 1);
- local_irq_enable();
+ for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+ spin_unlock(conf->hash_locks + i);
+ spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
@@ -156,17 +157,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio_list *return_bi)
-{
- struct bio *bi;
- while ((bi = bio_list_pop(return_bi)) != NULL) {
- bi->bi_iter.bi_size = 0;
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
-}
-
static void print_raid5_conf (struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
@@ -176,6 +166,13 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+ return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+ test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+ !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
@@ -191,7 +188,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
if (list_empty(&sh->lru)) {
struct r5worker_group *group;
group = conf->worker_groups + cpu_to_group(cpu);
- list_add_tail(&sh->lru, &group->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru, &group->loprio_list);
+ else
+ list_add_tail(&sh->lru, &group->handle_list);
group->stripes_cnt++;
sh->group = group;
}
@@ -233,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
- * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
- * data in journal, so they are not released to cached lists
+ * In the following cases, the stripe cannot be released to cached
+ * lists. Therefore, we make the stripe write out and set
+ * STRIPE_HANDLE:
+ * 1. when quiesce in r5c write back;
+ * 2. when resync is requested fot the stripe.
*/
- if (conf->quiesce && r5c_is_writeback(conf->log) &&
- !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+ (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -254,7 +258,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
if (conf->worker_cnt_per_group == 0) {
- list_add_tail(&sh->lru, &conf->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru,
+ &conf->loprio_list);
+ else
+ list_add_tail(&sh->lru,
+ &conf->handle_list);
} else {
raid5_wakeup_stripe_thread(sh);
return;
@@ -481,6 +490,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
return 0;
}
@@ -707,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
- local_irq_disable();
if (sh1 > sh2) {
- spin_lock(&sh2->stripe_lock);
+ spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
- spin_lock(&sh1->stripe_lock);
+ spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
@@ -720,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
- spin_unlock(&sh2->stripe_lock);
- local_irq_enable();
+ spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -729,7 +737,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,41 +871,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
return 1;
}
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
{
- struct bio_list tmp;
struct bio *bio;
- if (!conf->batch_bio_dispatch || !conf->group_cnt)
+ while ((bio = bio_list_pop(tmp)))
+ generic_make_request(bio);
+}
+
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+ const struct r5pending_data *da = list_entry(a,
+ struct r5pending_data, sibling);
+ const struct r5pending_data *db = list_entry(b,
+ struct r5pending_data, sibling);
+ if (da->sector > db->sector)
+ return 1;
+ if (da->sector < db->sector)
+ return -1;
+ return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+ struct bio_list *list)
+{
+ struct r5pending_data *data;
+ struct list_head *first, *next = NULL;
+ int cnt = 0;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+ first = conf->pending_list.next;
+
+ /* temporarily move the head */
+ if (conf->next_pending_data)
+ list_move_tail(&conf->pending_list,
+ &conf->next_pending_data->sibling);
+
+ while (!list_empty(&conf->pending_list)) {
+ data = list_first_entry(&conf->pending_list,
+ struct r5pending_data, sibling);
+ if (&data->sibling == first)
+ first = data->sibling.next;
+ next = data->sibling.next;
+
+ bio_list_merge(list, &data->bios);
+ list_move(&data->sibling, &conf->free_list);
+ cnt++;
+ if (cnt >= target)
+ break;
+ }
+ conf->pending_data_cnt -= cnt;
+ BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+ if (next != &conf->pending_list)
+ conf->next_pending_data = list_entry(next,
+ struct r5pending_data, sibling);
+ else
+ conf->next_pending_data = NULL;
+ /* list isn't empty */
+ if (first != &conf->pending_list)
+ list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+
+ if (conf->pending_data_cnt == 0)
return;
- bio_list_init(&tmp);
spin_lock(&conf->pending_bios_lock);
- bio_list_merge(&tmp, &conf->pending_bios);
- bio_list_init(&conf->pending_bios);
+ dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+ BUG_ON(conf->pending_data_cnt != 0);
spin_unlock(&conf->pending_bios_lock);
- while ((bio = bio_list_pop(&tmp)))
- generic_make_request(bio);
+ dispatch_bio_list(&tmp);
}
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+ struct bio_list *bios)
{
- /*
- * change group_cnt will drain all bios, so this is safe
- *
- * A read generally means a read-modify-write, which usually means a
- * randwrite, so we don't delay it
- */
- if (!conf->batch_bio_dispatch || !conf->group_cnt ||
- bio_op(bio) == REQ_OP_READ) {
- generic_make_request(bio);
- return;
- }
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ struct r5pending_data *ent;
+
spin_lock(&conf->pending_bios_lock);
- bio_list_add(&conf->pending_bios, bio);
+ ent = list_first_entry(&conf->free_list, struct r5pending_data,
+ sibling);
+ list_move_tail(&ent->sibling, &conf->pending_list);
+ ent->sector = sector;
+ bio_list_init(&ent->bios);
+ bio_list_merge(&ent->bios, bios);
+ conf->pending_data_cnt++;
+ if (conf->pending_data_cnt >= PENDING_IO_MAX)
+ dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
spin_unlock(&conf->pending_bios_lock);
- md_wakeup_thread(conf->mddev->thread);
+
+ dispatch_bio_list(&tmp);
}
static void
@@ -910,21 +984,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
struct stripe_head *head_sh = sh;
+ struct bio_list pending_bios = BIO_EMPTY_LIST;
+ bool should_defer;
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return;
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
- } else { /* caching phase */
- if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- r5c_cache_data(conf->log, sh, s);
- return;
- }
- }
+ if (log_stripe(sh, s) == 0)
+ return;
+
+ should_defer = conf->batch_bio_dispatch && conf->group_cnt;
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -1080,7 +1148,10 @@ again:
trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
bi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- defer_bio_issue(conf, bi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, bi);
+ else
+ generic_make_request(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
@@ -1125,7 +1196,10 @@ again:
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- defer_bio_issue(conf, rbi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, rbi);
+ else
+ generic_make_request(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1143,6 +1217,9 @@ again:
if (sh != head_sh)
goto again;
}
+
+ if (should_defer && !bio_list_empty(&pending_bios))
+ defer_issue_bios(conf, head_sh->sector, &pending_bios);
}
static struct dma_async_tx_descriptor *
@@ -1212,7 +1289,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1236,16 +1312,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi))
- bio_list_add(&return_bi, rbi);
+ bio_endio(rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(&return_bi);
-
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
@@ -2014,6 +2087,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
tx = ops_run_prexor6(sh, percpu, tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
@@ -2046,8 +2122,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+ if (sh->ppl_page)
+ __free_page(sh->ppl_page);
+ kmem_cache_free(sc, sh);
+}
+
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
- int disks)
+ int disks, struct r5conf *conf)
{
struct stripe_head *sh;
int i;
@@ -2061,6 +2144,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
INIT_LIST_HEAD(&sh->r5c);
INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
+ sh->raid_conf = conf;
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -2068,6 +2152,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
bio_init(&dev->req, &dev->vec, 1);
bio_init(&dev->rreq, &dev->rvec, 1);
}
+
+ if (raid5_has_ppl(conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page) {
+ free_stripe(sc, sh);
+ sh = NULL;
+ }
+ }
}
return sh;
}
@@ -2075,15 +2167,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
if (!sh)
return 0;
- sh->raid_conf = conf;
-
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
return 0;
}
sh->hash_lock_index =
@@ -2210,7 +2300,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* pages have been transferred over, and the old kmem_cache is
* freed when all stripes are done.
* 3/ reallocate conf->disks to be suitable bigger. If this fails,
- * we simple return a failre status - no need to clean anything up.
+ * we simple return a failure status - no need to clean anything up.
* 4/ allocate new pages for the new slots in the new stripe_heads.
* If this fails, we don't bother trying the shrink the
* stripe_heads down again, we just leave them as they are.
@@ -2223,17 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err;
+ int err = 0;
struct kmem_cache *sc;
int i;
int hash, cnt;
- if (newsize <= conf->pool_size)
- return 0; /* never bother to shrink */
-
- err = md_allow_write(conf->mddev);
- if (err)
- return err;
+ md_allow_write(conf->mddev);
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2246,11 +2331,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
if (!nsh)
break;
- nsh->raid_conf = conf;
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2258,7 +2342,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
while (!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del(&nsh->lru);
- kmem_cache_free(sc, nsh);
+ free_stripe(sc, nsh);
}
kmem_cache_destroy(sc);
mutex_unlock(&conf->cache_size_mutex);
@@ -2284,7 +2368,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh->dev[i].orig_page = osh->dev[i].page;
}
nsh->hash_lock_index = hash;
- kmem_cache_free(conf->slab_cache, osh);
+ free_stripe(conf->slab_cache, osh);
cnt++;
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2323,6 +2407,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2340,8 +2428,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -2359,7 +2445,7 @@ static int drop_one_stripe(struct r5conf *conf)
return 0;
BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
return 1;
@@ -2607,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
- r5c_update_on_rdev_error(mddev);
+ r5c_update_on_rdev_error(mddev, rdev);
}
/*
@@ -2968,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
* no_space_stripes list.
*
+ * 3. during journal failure
+ * In journal failure, we try to flush all cached data to raid disks
+ * based on data in stripe cache. The array is read-only to upper
+ * layers, so we would skip all pending writes.
+ *
*/
static inline bool delay_towrite(struct r5conf *conf,
struct r5dev *dev,
@@ -2981,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
s->injournal > 0)
return true;
+ /* case 3 above */
+ if (s->log_failed && s->injournal)
+ return true;
return false;
}
@@ -3082,6 +3176,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3103,14 +3203,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
- /*
- * If several bio share a stripe. The bio bi_phys_segments acts as a
- * reference count to avoid race. The reference count should already be
- * increased before this function is called (for example, in
- * raid5_make_request()), so other bio sharing this stripe will not free the
- * stripe. If a stripe is owned by one stripe, the stripe lock will
- * protect it.
- */
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
@@ -3129,6 +3221,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3136,7 +3258,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(conf->mddev, bi);
if (forwrite) {
/* check if page is covered */
@@ -3213,8 +3336,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks,
- struct bio_list *return_bi)
+ struct stripe_head_state *s, int disks)
{
int i;
BUG_ON(sh->batch_head);
@@ -3250,7 +3372,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3260,10 +3382,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = nextbi;
}
if (bitmap_end)
@@ -3284,10 +3404,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = bi2;
}
@@ -3312,8 +3430,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi))
- bio_list_add(return_bi, bi);
+ bio_endio(bi);
bi = nextbi;
}
}
@@ -3449,7 +3566,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
/* Pre-reads at not permitted until after short delay
* to gather multiple requests. However if this
- * device is no Insync, the block could only be be computed
+ * device is no Insync, the block could only be computed
* and there is no need to delay that.
*/
return 0;
@@ -3468,7 +3585,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
- * or because parity cannot be trusted and we are currently
+ * because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
@@ -3508,9 +3625,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
+
+ /*
+ * In the raid6 case if the only non-uptodate disk is P
+ * then we already trusted P to compute the other failed
+ * drives. It is safe to compute rather than re-read P.
+ * In other cases we only compute blocks from failed
+ * devices, otherwise check/repair might fail to detect
+ * a real inconsistency.
+ */
+
if ((s->uptodate == disks - 1) &&
+ ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1]))) {
+ disk_idx == s->failed_num[1])))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -3612,7 +3740,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
struct r5dev *dev;
@@ -3644,10 +3772,8 @@ returnbi:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3669,7 +3795,7 @@ returnbi:
discard_pending = 1;
}
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4534,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
- /* Cannot process 'sync' concurrently with 'discard' */
- if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ /*
+ * Cannot process 'sync' concurrently with 'discard'.
+ * Flush data in r5cache before 'sync'.
+ */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4556,7 +4687,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
- if (s.handle_bad_blocks) {
+ if (s.handle_bad_blocks ||
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -4581,15 +4713,20 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
- /* check if the array has lost more than max_degraded devices and,
+ /*
+ * check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
+ *
+ * When journal device failed (log_failed), we will only process
+ * the stripe if there is data need write to raid disks
*/
- if (s.failed > conf->max_degraded || s.log_failed) {
+ if (s.failed > conf->max_degraded ||
+ (s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -4655,11 +4792,11 @@ static void handle_stripe(struct stripe_head *sh)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+ handle_stripe_clean_event(conf, sh, disks);
if (s.just_cached)
- r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
- r5l_stripe_write_finished(sh);
+ r5c_handle_cached_data_endio(conf, sh, disks);
+ log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -4886,16 +5023,6 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
- spin_lock_irq(&conf->device_lock);
- bio_list_merge(&conf->return_bi, &s.return_bi);
- spin_unlock_irq(&conf->device_lock);
- md_wakeup_thread(conf->mddev->thread);
- } else
- return_io(&s.return_bi);
- }
-
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4984,12 +5111,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
md_wakeup_thread(conf->mddev->thread);
}
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+ unsigned int *offset)
{
struct bio *bi;
bi = conf->retry_read_aligned;
if (bi) {
+ *offset = conf->retry_read_offset;
conf->retry_read_aligned = NULL;
return bi;
}
@@ -4997,11 +5126,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
- /*
- * this sets the active strip count to 1 and the processed
- * strip count to zero (upper 8 bits)
- */
- raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+ *offset = 0;
}
return bi;
@@ -5136,24 +5261,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
- do {
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
- if (sectors < bio_sectors(raid_bio)) {
- split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, raid_bio);
- } else
- split = raid_bio;
+ if (sectors < bio_sectors(raid_bio)) {
+ struct r5conf *conf = mddev->private;
+ split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ bio_chain(split, raid_bio);
+ generic_make_request(raid_bio);
+ raid_bio = split;
+ }
- if (!raid5_read_one_chunk(mddev, split)) {
- if (split != raid_bio)
- generic_make_request(raid_bio);
- return split;
- }
- } while (split != raid_bio);
+ if (!raid5_read_one_chunk(mddev, raid_bio))
+ return raid_bio;
return NULL;
}
@@ -5170,19 +5291,29 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh = NULL, *tmp;
+ struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
- struct r5worker_group *wg = NULL;
+ struct r5worker_group *wg;
+ bool second_try = !r5c_is_writeback(conf->log) &&
+ !r5l_log_disk_error(conf);
+ bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+ r5l_log_disk_error(conf);
+again:
+ wg = NULL;
+ sh = NULL;
if (conf->worker_cnt_per_group == 0) {
- handle_list = &conf->handle_list;
+ handle_list = try_loprio ? &conf->loprio_list :
+ &conf->handle_list;
} else if (group != ANY_GROUP) {
- handle_list = &conf->worker_groups[group].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+ &conf->worker_groups[group].handle_list;
wg = &conf->worker_groups[group];
} else {
int i;
for (i = 0; i < conf->group_cnt; i++) {
- handle_list = &conf->worker_groups[i].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+ &conf->worker_groups[i].handle_list;
wg = &conf->worker_groups[i];
if (!list_empty(handle_list))
break;
@@ -5233,8 +5364,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
wg = NULL;
}
- if (!sh)
- return NULL;
+ if (!sh) {
+ if (second_try)
+ return NULL;
+ second_try = true;
+ try_loprio = !try_loprio;
+ goto again;
+ }
if (wg) {
wg->stripes_cnt--;
@@ -5323,7 +5459,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
struct r5conf *conf = mddev->private;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
- int remaining;
int stripe_sectors;
if (mddev->reshape_position != MaxSector)
@@ -5334,7 +5469,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ md_write_start(mddev, bi);
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5380,7 +5515,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(mddev, bi);
sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
@@ -5403,11 +5539,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
- md_write_end(mddev);
- bio_endio(bi);
- }
+ md_write_end(mddev);
+ bio_endio(bi);
}
static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5418,7 +5551,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
- int remaining;
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
@@ -5440,8 +5572,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- md_write_start(mddev, bi);
-
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -5462,7 +5592,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ md_write_start(mddev, bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5597,16 +5727,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
}
finish_wait(&conf->wait_for_overlap, &w);
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
-
- if ( rw == WRITE )
- md_write_end(mddev);
-
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
+ if (rw == WRITE)
+ md_write_end(mddev);
+ bio_endio(bi);
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5955,7 +6078,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return STRIPE_SECTORS;
}
-static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+ unsigned int offset)
{
/* We may not be able to submit a whole bio at once as there
* may not be enough stripe_heads available.
@@ -5971,7 +6095,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
- int remaining;
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5985,7 +6108,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid5_bi_processed_stripes(raid_bio))
+ if (scnt < offset)
/* already done this stripe */
continue;
@@ -5993,15 +6116,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
if (!sh) {
/* failed to get a stripe - must wait */
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
@@ -6010,12 +6133,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
raid5_release_stripe(sh);
handled++;
}
- remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0) {
- trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
- raid_bio, 0);
- bio_endio(raid_bio);
- }
+
+ bio_endio(raid_bio);
+
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return handled;
@@ -6058,7 +6178,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ log_write_stripe_run(conf);
cond_resched();
@@ -6075,6 +6195,7 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf;
+ struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups;
int handled;
struct blk_plug plug;
@@ -6095,6 +6216,9 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released)
break;
handled += batch_size;
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6122,24 +6246,13 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
- if (!bio_list_empty(&conf->return_bi) &&
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- struct bio_list tmp = BIO_EMPTY_LIST;
- spin_lock_irq(&conf->device_lock);
- if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- bio_list_merge(&tmp, &conf->return_bi);
- bio_list_init(&conf->return_bi);
- }
- spin_unlock_irq(&conf->device_lock);
- return_io(&tmp);
- }
-
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
int batch_size, released;
+ unsigned int offset;
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
@@ -6157,10 +6270,10 @@ static void raid5d(struct md_thread *thread)
}
raid5_activate_delayed(conf);
- while ((bio = remove_bio_from_retry(conf))) {
+ while ((bio = remove_bio_from_retry(conf, &offset))) {
int ok;
spin_unlock_irq(&conf->device_lock);
- ok = retry_aligned_read(conf, bio);
+ ok = retry_aligned_read(conf, bio, offset);
spin_lock_irq(&conf->device_lock);
if (!ok)
break;
@@ -6219,7 +6332,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
- int err;
if (size <= 16 || size > 32768)
return -EINVAL;
@@ -6231,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
;
mutex_unlock(&conf->cache_size_mutex);
-
- err = md_allow_write(mddev);
- if (err)
- return err;
+ md_allow_write(mddev);
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
@@ -6544,6 +6653,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
+ INIT_LIST_HEAD(&group->loprio_list);
group->conf = conf;
group->workers = workers + i * cnt;
@@ -6634,8 +6744,8 @@ static void free_conf(struct r5conf *conf)
{
int i;
- if (conf->log)
- r5l_exit_log(conf->log);
+ log_exit(conf);
+
if (conf->shrinker.nr_deferred)
unregister_shrinker(&conf->shrinker);
@@ -6646,7 +6756,10 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
+ kfree(conf->pending_data);
kfree(conf);
}
@@ -6756,6 +6869,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+ INIT_LIST_HEAD(&conf->free_list);
+ INIT_LIST_HEAD(&conf->pending_list);
+ conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+ PENDING_IO_MAX, GFP_KERNEL);
+ if (!conf->pending_data)
+ goto abort;
+ for (i = 0; i < PENDING_IO_MAX; i++)
+ list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
&new_group)) {
@@ -6771,15 +6892,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
atomic_set(&conf->active_aligned_reads, 0);
- bio_list_init(&conf->pending_bios);
spin_lock_init(&conf->pending_bios_lock);
conf->batch_bio_dispatch = true;
rdev_for_each(rdev, mddev) {
@@ -6813,6 +6933,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7097,6 +7220,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7188,7 +7318,10 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
- if (mddev->ok_start_degraded)
+ if (test_bit(MD_HAS_PPL, &mddev->flags))
+ pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+ mdname(mddev));
+ else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
@@ -7254,14 +7387,6 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
- /*
- * We use 16-bit counter of active stripes in bi_phys_segments
- * (minus one for over-loaded initialization)
- */
- blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
- blk_queue_max_discard_sectors(mddev->queue,
- 0xfffe * STRIPE_SECTORS);
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -7299,14 +7424,8 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
-
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
- goto abort;
- }
+ if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+ goto abort;
return 0;
abort:
@@ -7420,17 +7539,18 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
- struct r5l_log *log;
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
+ * neilb: there is no locking about new writes here,
+ * so this cannot be safe.
*/
- if (atomic_read(&mddev->writes_pending))
+ if (atomic_read(&conf->active_stripes) ||
+ atomic_read(&conf->r5c_cached_full_stripes) ||
+ atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY;
- log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
- r5l_exit_log(log);
+ }
+ log_exit(conf);
return 0;
}
if (rdev == p->rdev)
@@ -7469,6 +7589,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*rdevp = rdev;
}
}
+ if (!err) {
+ err = log_modify(conf, rdev, false);
+ if (err)
+ goto abort;
+ }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
@@ -7477,12 +7602,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither - if they are careful
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just removed the Replacement as faulty-
- * clear the bit just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+
+ if (!err)
+ err = log_modify(conf, p->rdev, true);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
@@ -7499,7 +7624,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
@@ -7508,9 +7632,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ log_init(conf, rdev, false);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7537,10 +7659,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
- err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
+
+ err = log_modify(conf, rdev, true);
+
goto out;
}
}
@@ -7574,7 +7698,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7625,7 +7749,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
@@ -7658,6 +7782,9 @@ static int check_reshape(struct mddev *mddev)
mddev->chunk_sectors)
) < 0)
return -ENOMEM;
+
+ if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+ return 0; /* never bother to shrink */
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -8148,6 +8275,68 @@ static void *raid6_takeover(struct mddev *mddev)
return setup_conf(mddev);
}
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+ struct r5conf *conf;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf) {
+ mddev_unlock(mddev);
+ return -ENODEV;
+ }
+
+ if (strncmp(buf, "ppl", 3) == 0) {
+ /* ppl only works with RAID 5 */
+ if (!raid5_has_ppl(conf) && conf->level == 5) {
+ err = log_init(conf, NULL, true);
+ if (!err) {
+ err = resize_stripes(conf, conf->pool_size);
+ if (err)
+ log_exit(conf);
+ }
+ } else
+ err = -EINVAL;
+ } else if (strncmp(buf, "resync", 6) == 0) {
+ if (raid5_has_ppl(conf)) {
+ mddev_suspend(mddev);
+ log_exit(conf);
+ mddev_resume(mddev);
+ err = resize_stripes(conf, conf->pool_size);
+ } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+ r5l_log_disk_error(conf)) {
+ bool journal_dev_exists = false;
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev_exists = true;
+ break;
+ }
+
+ if (!journal_dev_exists) {
+ mddev_suspend(mddev);
+ clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+ mddev_resume(mddev);
+ } else /* need remove journal device first */
+ err = -EBUSY;
+ } else
+ err = -EINVAL;
+ } else {
+ err = -EINVAL;
+ }
+
+ if (!err)
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+
+ return err;
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8170,6 +8359,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
{
@@ -8193,6 +8383,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid4_personality =
@@ -8217,6 +8408,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static int __init raid5_init(void)