aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c7
-rw-r--r--drivers/md/dm-crypt.c12
-rw-r--r--drivers/md/dm-mpath.c4
-rw-r--r--drivers/md/dm-table.c16
-rw-r--r--drivers/md/dm.c40
-rw-r--r--drivers/md/md.c32
-rw-r--r--drivers/md/raid0.c9
-rw-r--r--drivers/md/raid10.c1
-rw-r--r--drivers/md/raid5.c270
-rw-r--r--drivers/md/raid5.h5
10 files changed, 234 insertions, 162 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2bc56e2a3526..135a0907e9de 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -177,11 +177,16 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
* nr_pending is 0 and In_sync is clear, the entries we return will
* still be in the same position on the list when we re-enter
* list_for_each_entry_continue_rcu.
+ *
+ * Note that if entered with 'rdev == NULL' to start at the
+ * beginning, we temporarily assign 'rdev' to an address which
+ * isn't really an rdev, but which can be used by
+ * list_for_each_entry_continue_rcu() to find the first entry.
*/
rcu_read_lock();
if (rdev == NULL)
/* start at the beginning */
- rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
+ rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
else {
/* release the previous rdev and start from there. */
rdev_dec_pending(rdev, mddev);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9eeea196328a..5503e43e5f28 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -925,10 +925,11 @@ static int crypt_convert(struct crypt_config *cc,
switch (r) {
/* async */
- case -EINPROGRESS:
case -EBUSY:
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
+ /* fall through*/
+ case -EINPROGRESS:
ctx->req = NULL;
ctx->cc_sector++;
continue;
@@ -1345,8 +1346,10 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->cc;
- if (error == -EINPROGRESS)
+ if (error == -EINPROGRESS) {
+ complete(&ctx->restart);
return;
+ }
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1357,15 +1360,12 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
if (!atomic_dec_and_test(&ctx->cc_pending))
- goto done;
+ return;
if (bio_data_dir(io->base_bio) == READ)
kcryptd_crypt_read_done(io);
else
kcryptd_crypt_write_io_submit(io, 1);
-done:
- if (!completion_done(&ctx->restart))
- complete(&ctx->restart);
}
static void kcryptd_crypt(struct work_struct *work)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 63953477a07c..eff7bdd7731d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -429,9 +429,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
/* blk-mq request-based interface */
*__clone = blk_get_request(bdev_get_queue(bdev),
rq_data_dir(rq), GFP_ATOMIC);
- if (IS_ERR(*__clone))
+ if (IS_ERR(*__clone)) {
/* ENOMEM, requeue */
+ clear_mapinfo(m, map_context);
return r;
+ }
(*__clone)->bio = (*__clone)->biotail = NULL;
(*__clone)->rq_disk = bdev->bd_disk;
(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3662b2e49b8d..a5f94125ad01 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -820,6 +820,12 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
}
EXPORT_SYMBOL(dm_consume_args);
+static bool __table_type_request_based(unsigned table_type)
+{
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
@@ -852,8 +858,7 @@ static int dm_table_set_type(struct dm_table *t)
* Determine the type from the live device.
* Default to bio-based if device is new.
*/
- if (live_md_type == DM_TYPE_REQUEST_BASED ||
- live_md_type == DM_TYPE_MQ_REQUEST_BASED)
+ if (__table_type_request_based(live_md_type))
request_based = 1;
else
bio_based = 1;
@@ -903,7 +908,7 @@ static int dm_table_set_type(struct dm_table *t)
}
t->type = DM_TYPE_MQ_REQUEST_BASED;
- } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) {
+ } else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
/* inherit live MD type */
t->type = live_md_type;
@@ -925,10 +930,7 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
bool dm_table_request_based(struct dm_table *t)
{
- unsigned table_type = dm_table_get_type(t);
-
- return (table_type == DM_TYPE_REQUEST_BASED ||
- table_type == DM_TYPE_MQ_REQUEST_BASED);
+ return __table_type_request_based(dm_table_get_type(t));
}
bool dm_table_mq_request_based(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 38837f8ea327..4d6f089a0e9e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1031,13 +1031,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
dm_put(md);
}
-static void free_rq_clone(struct request *clone, bool must_be_mapped)
+static void free_rq_clone(struct request *clone)
{
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
- WARN_ON_ONCE(must_be_mapped && !clone->q);
-
if (md->type == DM_TYPE_MQ_REQUEST_BASED)
/* stacked on blk-mq queue(s) */
tio->ti->type->release_clone_rq(clone);
@@ -1079,7 +1077,7 @@ static void dm_end_request(struct request *clone, int error)
rq->sense_len = clone->sense_len;
}
- free_rq_clone(clone, true);
+ free_rq_clone(clone);
if (!rq->q->mq_ops)
blk_end_request_all(rq, error);
else
@@ -1098,7 +1096,7 @@ static void dm_unprep_request(struct request *rq)
}
if (clone)
- free_rq_clone(clone, false);
+ free_rq_clone(clone);
}
/*
@@ -1111,6 +1109,7 @@ static void old_requeue_request(struct request *rq)
spin_lock_irqsave(q->queue_lock, flags);
blk_requeue_request(q, rq);
+ blk_run_queue_async(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
@@ -1671,8 +1670,7 @@ static int dm_merge_bvec(struct request_queue *q,
struct mapped_device *md = q->queuedata;
struct dm_table *map = dm_get_live_table_fast(md);
struct dm_target *ti;
- sector_t max_sectors;
- int max_size = 0;
+ sector_t max_sectors, max_size = 0;
if (unlikely(!map))
goto out;
@@ -1687,8 +1685,16 @@ static int dm_merge_bvec(struct request_queue *q,
max_sectors = min(max_io_len(bvm->bi_sector, ti),
(sector_t) queue_max_sectors(q));
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
- max_size = 0;
+
+ /*
+ * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
+ * to the targets' merge function since it holds sectors not bytes).
+ * Just doing this as an interim fix for stable@ because the more
+ * comprehensive cleanup of switching to sector_t will impact every
+ * DM target that implements a ->merge hook.
+ */
+ if (max_size > INT_MAX)
+ max_size = INT_MAX;
/*
* merge_bvec_fn() returns number of bytes
@@ -1696,7 +1702,7 @@ static int dm_merge_bvec(struct request_queue *q,
* max is precomputed maximal io size
*/
if (max_size && ti->type->merge)
- max_size = ti->type->merge(ti, bvm, biovec, max_size);
+ max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
/*
* If the target doesn't support merge method and some of the devices
* provided their merge_bvec method (we know this by looking for the
@@ -1887,8 +1893,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
dm_kill_unmapped_request(rq, r);
return r;
}
- if (IS_ERR(clone))
- return DM_MAPIO_REQUEUE;
+ if (r != DM_MAPIO_REMAPPED)
+ return r;
setup_clone(clone, rq, tio);
}
@@ -2663,13 +2669,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
/* clone request is allocated at the end of the pdu */
tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
- if (!clone_rq(rq, md, tio, GFP_ATOMIC))
- return BLK_MQ_RQ_QUEUE_BUSY;
+ (void) clone_rq(rq, md, tio, GFP_ATOMIC);
queue_kthread_work(&md->kworker, &tio->work);
} else {
/* Direct call is fine since .queue_rq allows allocations */
- if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
- dm_requeue_unmapped_original_request(md, rq);
+ if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+ /* Undo dm_start_request() before requeuing */
+ rq_completed(md, rq_data_dir(rq), false);
+ return BLK_MQ_RQ_QUEUE_BUSY;
+ }
}
return BLK_MQ_RQ_QUEUE_OK;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4f31e195e26..4dbed4a67aaf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3834,7 +3834,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = -EBUSY;
}
spin_unlock(&mddev->lock);
- return err;
+ return err ?: len;
}
err = mddev_lock(mddev);
if (err)
@@ -4211,34 +4211,36 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (cmd_match(page, "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- if (mddev_lock(mddev) == 0) {
+ if (cmd_match(page, "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ mddev_lock(mddev) == 0) {
+ flush_workqueue(md_misc_wq);
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
- mddev_unlock(mddev);
}
+ mddev_unlock(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
else if (cmd_match(page, "recover")) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} else if (cmd_match(page, "reshape")) {
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
err = mddev->pers->start_reshape(mddev);
mddev_unlock(mddev);
}
@@ -4250,6 +4252,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
else if (!cmd_match(page, "repair"))
return -EINVAL;
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
@@ -4818,12 +4821,12 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state);
+ if (mddev->queue)
+ blk_cleanup_queue(mddev->queue);
if (mddev->gendisk) {
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
- if (mddev->queue)
- blk_cleanup_queue(mddev->queue);
kfree(mddev);
}
@@ -8259,6 +8262,7 @@ void md_reap_sync_thread(struct mddev *mddev)
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 2cb59a641cd2..efb654eb5399 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -188,8 +188,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
dev[j] = rdev1;
- disk_stack_limits(mddev->gendisk, rdev1->bdev,
- rdev1->data_offset << 9);
+ if (mddev->queue)
+ disk_stack_limits(mddev->gendisk, rdev1->bdev,
+ rdev1->data_offset << 9);
if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
conf->has_merge_bvec = 1;
@@ -523,6 +524,9 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
? (sector & (chunk_sects-1))
: sector_div(sector, chunk_sects));
+ /* Restore due to sector_div */
+ sector = bio->bi_iter.bi_sector;
+
if (sectors < bio_sectors(bio)) {
split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
bio_chain(split, bio);
@@ -530,7 +534,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
split = bio;
}
- sector = bio->bi_iter.bi_sector;
zone = find_zone(mddev->private, &sector);
tmp_dev = map_sector(mddev, zone, sector, &sector);
split->bi_bdev = tmp_dev->bdev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e793ab6b3570..f55c3f35b746 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4156,6 +4156,7 @@ static int raid10_start_reshape(struct mddev *mddev)
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd720aaa0..b6793d2e051f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static bool stripe_can_batch(struct stripe_head *sh)
{
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+ !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
}
@@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
< IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
+ if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+ int seq = sh->bm_seq;
+ if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+ sh->batch_head->bm_seq > seq)
+ seq = sh->batch_head->bm_seq;
+ set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+ sh->batch_head->bm_seq = seq;
+ }
+
atomic_inc(&sh->count);
unlock_out:
unlock_two_stripes(head, sh);
@@ -1078,9 +1088,6 @@ again:
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
- if (sh->batch_head)
- set_bit(STRIPE_BATCH_ERR,
- &sh->batch_head->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -1825,7 +1832,7 @@ again:
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
to_addr_conv(sh, percpu, j));
- async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@ -1971,17 +1978,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+{
+ struct stripe_head *sh;
+
+ sh = kmem_cache_zalloc(sc, gfp);
+ if (sh) {
+ spin_lock_init(&sh->stripe_lock);
+ spin_lock_init(&sh->batch_lock);
+ INIT_LIST_HEAD(&sh->batch_list);
+ INIT_LIST_HEAD(&sh->lru);
+ atomic_set(&sh->count, 1);
+ }
+ return sh;
+}
static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = kmem_cache_zalloc(conf->slab_cache, gfp);
+
+ sh = alloc_stripe(conf->slab_cache, gfp);
if (!sh)
return 0;
sh->raid_conf = conf;
- spin_lock_init(&sh->stripe_lock);
-
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh);
@@ -1990,13 +2010,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
sh->hash_lock_index =
conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
/* we just created an active stripe so... */
- atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
- INIT_LIST_HEAD(&sh->lru);
- spin_lock_init(&sh->batch_lock);
- INIT_LIST_HEAD(&sh->batch_list);
- sh->batch_head = NULL;
release_stripe(sh);
conf->max_nr_stripes++;
return 1;
@@ -2060,6 +2075,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
return ret;
}
+static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
+{
+ unsigned long cpu;
+ int err = 0;
+
+ mddev_suspend(conf->mddev);
+ get_online_cpus();
+ for_each_present_cpu(cpu) {
+ struct raid5_percpu *percpu;
+ struct flex_array *scribble;
+
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+ scribble = scribble_alloc(new_disks,
+ new_sectors / STRIPE_SECTORS,
+ GFP_NOIO);
+
+ if (scribble) {
+ flex_array_free(percpu->scribble);
+ percpu->scribble = scribble;
+ } else {
+ err = -ENOMEM;
+ break;
+ }
+ }
+ put_online_cpus();
+ mddev_resume(conf->mddev);
+ return err;
+}
+
static int resize_stripes(struct r5conf *conf, int newsize)
{
/* Make all the stripes able to hold 'newsize' devices.
@@ -2088,7 +2132,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- unsigned long cpu;
int err;
struct kmem_cache *sc;
int i;
@@ -2109,13 +2152,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
return -ENOMEM;
for (i = conf->max_nr_stripes; i; i--) {
- nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
+ nsh = alloc_stripe(sc, GFP_KERNEL);
if (!nsh)
break;
nsh->raid_conf = conf;
- spin_lock_init(&nsh->stripe_lock);
-
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2142,13 +2183,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
lock_device_hash_lock(conf, hash));
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);
- atomic_set(&nsh->count, 1);
+
for(i=0; i<conf->pool_size; i++) {
nsh->dev[i].page = osh->dev[i].page;
nsh->dev[i].orig_page = osh->dev[i].page;
}
- for( ; i<newsize; i++)
- nsh->dev[i].page = NULL;
nsh->hash_lock_index = hash;
kmem_cache_free(conf->slab_cache, osh);
cnt++;
@@ -2174,25 +2213,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
} else
err = -ENOMEM;
- get_online_cpus();
- for_each_present_cpu(cpu) {
- struct raid5_percpu *percpu;
- struct flex_array *scribble;
-
- percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = scribble_alloc(newsize, conf->chunk_sectors /
- STRIPE_SECTORS, GFP_NOIO);
-
- if (scribble) {
- flex_array_free(percpu->scribble);
- percpu->scribble = scribble;
- } else {
- err = -ENOMEM;
- break;
- }
- }
- put_online_cpus();
-
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2212,7 +2232,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
conf->slab_cache = sc;
conf->active_name = 1-conf->active_name;
- conf->pool_size = newsize;
+ if (!err)
+ conf->pool_size = newsize;
return err;
}
@@ -2434,7 +2455,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && !uptodate)
+ if (sh->batch_head && !uptodate && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -2976,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_iter.bi_sector,
(unsigned long long)sh->sector, dd_idx);
- spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap && firstwrite) {
+ /* Cannot hold spinlock over bitmap_startwrite,
+ * but must ensure this isn't added to a batch until
+ * we have added to the bitmap and set bm_seq.
+ * So set STRIPE_BITMAP_PENDING to prevent
+ * batching.
+ * If multiple add_stripe_bio() calls race here they
+ * much all set STRIPE_BITMAP_PENDING. So only the first one
+ * to complete "bitmap_startwrite" gets to set
+ * STRIPE_BIT_DELAY. This is important as once a stripe
+ * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+ * any more.
+ */
+ set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+ spin_unlock_irq(&sh->stripe_lock);
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
+ spin_lock_irq(&sh->stripe_lock);
+ clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+ if (!sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
}
+ spin_unlock_irq(&sh->stripe_lock);
if (stripe_can_batch(sh))
stripe_add_to_batch_list(conf, sh);
@@ -3278,7 +3317,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* reconstruct-write isn't being forced */
return 0;
for (i = 0; i < s->failed; i++) {
- if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ if (s->failed_num[i] != sh->pd_idx &&
+ s->failed_num[i] != sh->qd_idx &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
!test_bit(R5_OVERWRITE, &fdev[i]->flags))
return 1;
}
@@ -3298,6 +3339,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
*/
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
+ BUG_ON(sh->batch_head);
if ((s->uptodate == disks - 1) &&
(s->failed && (disk_idx == s->failed_num[0] ||
disk_idx == s->failed_num[1]))) {
@@ -3366,7 +3408,6 @@ static void handle_stripe_fill(struct stripe_head *sh,
{
int i;
- BUG_ON(sh->batch_head);
/* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the
* midst of changing due to a write
@@ -3379,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
set_bit(STRIPE_HANDLE, &sh->state);
}
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+ unsigned long handle_flags);
/* handle_stripe_clean_event
* any written block on an uptodate or failed drive can be returned.
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3392,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
int discard_pending = 0;
struct stripe_head *head_sh = sh;
bool do_endio = false;
- int wakeup_nr = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
@@ -3481,44 +3523,8 @@ unhash:
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
- if (!head_sh->batch_head || !do_endio)
- return;
- for (i = 0; i < head_sh->disks; i++) {
- if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
- wakeup_nr++;
- }
- while (!list_empty(&head_sh->batch_list)) {
- int i;
- sh = list_first_entry(&head_sh->batch_list,
- struct stripe_head, batch_list);
- list_del_init(&sh->batch_list);
-
- set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
- head_sh->state & ~((1 << STRIPE_ACTIVE) |
- (1 << STRIPE_PREREAD_ACTIVE) |
- STRIPE_EXPAND_SYNC_FLAG));
- sh->check_state = head_sh->check_state;
- sh->reconstruct_state = head_sh->reconstruct_state;
- for (i = 0; i < sh->disks; i++) {
- if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wakeup_nr++;
- sh->dev[i].flags = head_sh->dev[i].flags;
- }
-
- spin_lock_irq(&sh->stripe_lock);
- sh->batch_head = NULL;
- spin_unlock_irq(&sh->stripe_lock);
- if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
-
- spin_lock_irq(&head_sh->stripe_lock);
- head_sh->batch_head = NULL;
- spin_unlock_irq(&head_sh->stripe_lock);
- wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
- if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
- set_bit(STRIPE_HANDLE, &head_sh->state);
+ if (head_sh->batch_head && do_endio)
+ break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}
static void handle_stripe_dirtying(struct r5conf *conf,
@@ -4159,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
static int clear_batch_ready(struct stripe_head *sh)
{
+ /* Return '1' if this is a member of batch, or
+ * '0' if it is a lone stripe or a head which can now be
+ * handled.
+ */
struct stripe_head *tmp;
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
- return 0;
+ return (sh->batch_head && sh->batch_head != sh);
spin_lock(&sh->stripe_lock);
if (!sh->batch_head) {
spin_unlock(&sh->stripe_lock);
@@ -4189,46 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh)
return 0;
}
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+ unsigned long handle_flags)
{
- struct stripe_head *head_sh, *next;
+ struct stripe_head *sh, *next;
int i;
+ int do_wakeup = 0;
- if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
- return;
+ list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
- head_sh = sh;
- do {
- sh = list_first_entry(&sh->batch_list,
- struct stripe_head, batch_list);
- BUG_ON(sh == head_sh);
- } while (!test_bit(STRIPE_DEGRADED, &sh->state));
-
- while (sh != head_sh) {
- next = list_first_entry(&sh->batch_list,
- struct stripe_head, batch_list);
list_del_init(&sh->batch_list);
- set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
- head_sh->state & ~((1 << STRIPE_ACTIVE) |
- (1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED) |
- STRIPE_EXPAND_SYNC_FLAG));
+ WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_SYNCING) |
+ (1 << STRIPE_REPLACED) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ (1 << STRIPE_DELAYED) |
+ (1 << STRIPE_BIT_DELAY) |
+ (1 << STRIPE_FULL_WRITE) |
+ (1 << STRIPE_BIOFILL_RUN) |
+ (1 << STRIPE_COMPUTE_RUN) |
+ (1 << STRIPE_OPS_REQ_PENDING) |
+ (1 << STRIPE_DISCARD) |
+ (1 << STRIPE_BATCH_READY) |
+ (1 << STRIPE_BATCH_ERR) |
+ (1 << STRIPE_BITMAP_PENDING)));
+ WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+ (1 << STRIPE_REPLACED)));
+
+ set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+ (1 << STRIPE_DEGRADED)),
+ head_sh->state & (1 << STRIPE_INSYNC));
+
sh->check_state = head_sh->check_state;
sh->reconstruct_state = head_sh->reconstruct_state;
- for (i = 0; i < sh->disks; i++)
+ for (i = 0; i < sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ do_wakeup = 1;
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
-
+ }
spin_lock_irq(&sh->stripe_lock);
sh->batch_head = NULL;
spin_unlock_irq(&sh->stripe_lock);
-
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (handle_flags == 0 ||
+ sh->state & handle_flags)
+ set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
-
- sh = next;
}
+ spin_lock_irq(&head_sh->stripe_lock);
+ head_sh->batch_head = NULL;
+ spin_unlock_irq(&head_sh->stripe_lock);
+ for (i = 0; i < head_sh->disks; i++)
+ if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+ do_wakeup = 1;
+ if (head_sh->state & handle_flags)
+ set_bit(STRIPE_HANDLE, &head_sh->state);
+
+ if (do_wakeup)
+ wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
@@ -4253,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- check_break_stripe_batch_list(sh);
+ if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+ break_stripe_batch_list(sh, 0);
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
@@ -4307,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
if (s.failed > conf->max_degraded) {
sh->check_state = 0;
sh->reconstruct_state = 0;
+ break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
if (s.syncing + s.replacing)
@@ -6221,8 +6252,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
percpu->spare_page = alloc_page(GFP_KERNEL);
if (!percpu->scribble)
percpu->scribble = scribble_alloc(max(conf->raid_disks,
- conf->previous_raid_disks), conf->chunk_sectors /
- STRIPE_SECTORS, GFP_KERNEL);
+ conf->previous_raid_disks),
+ max(conf->chunk_sectors,
+ conf->prev_chunk_sectors)
+ / STRIPE_SECTORS,
+ GFP_KERNEL);
if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
free_scratch_buffer(conf, percpu);
@@ -7198,6 +7232,15 @@ static int check_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev))
return -ENOSPC;
+ if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
+ mddev->delta_disks > 0)
+ if (resize_chunks(conf,
+ conf->previous_raid_disks
+ + max(0, mddev->delta_disks),
+ max(mddev->new_chunk_sectors,
+ mddev->chunk_sectors)
+ ) < 0)
+ return -ENOMEM;
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -7311,6 +7354,7 @@ static int raid5_start_reshape(struct mddev *mddev)
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd86074b..896d603ad0da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,9 +337,12 @@ enum {
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
STRIPE_BATCH_ERR,
+ STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
+ * to batch yet.
+ */
};
-#define STRIPE_EXPAND_SYNC_FLAG \
+#define STRIPE_EXPAND_SYNC_FLAGS \
((1 << STRIPE_EXPAND_SOURCE) |\
(1 << STRIPE_EXPAND_READY) |\
(1 << STRIPE_EXPANDING) |\