aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c166
1 files changed, 77 insertions, 89 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c5439dce0295..3ee2912889e7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,8 @@
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <trace/events/block.h>
+
#include "md.h"
#include "raid5.h"
#include "raid0.h"
@@ -361,10 +363,9 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
short generation)
{
struct stripe_head *sh;
- struct hlist_node *hn;
pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
- hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
+ hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
if (sh->sector == sector && sh->generation == generation)
return sh;
pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -466,7 +467,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0 || noquiesce,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
@@ -480,8 +481,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
- conf->device_lock,
- );
+ conf->device_lock);
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
@@ -671,6 +671,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(bi);
}
if (rrdev) {
@@ -698,6 +701,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
rbi->bi_next = NULL;
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
@@ -1397,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
@@ -1462,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
- struct stripe_head *sh = param;
- unsigned long ops_request = sh->ops.request;
-
- clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
- wake_up(&sh->ops.wait_for_ops);
-
- __raid_run_ops(sh, ops_request);
- release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
- /* since handle_stripe can be called outside of raid5d context
- * we need to ensure sh->ops.request is de-staged before another
- * request arrives
- */
- wait_event(sh->ops.wait_for_ops,
- !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
- sh->ops.request = ops_request;
-
- atomic_inc(&sh->count);
- async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
static int grow_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
@@ -1500,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
return 0;
sh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&sh->ops.wait_for_ops);
- #endif
spin_lock_init(&sh->stripe_lock);
@@ -1576,7 +1549,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* This happens in stages:
* 1/ create a new kmem_cache and allocate the required number of
* stripe_heads.
- * 2/ gather all the old stripe_heads and tranfer the pages across
+ * 2/ gather all the old stripe_heads and transfer the pages across
* to the new stripe_heads. This will have the side effect of
* freezing the array as once all stripe_heads have been collected,
* no IO will be possible. Old stripe heads are freed once their
@@ -1621,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
break;
nsh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&nsh->ops.wait_for_ops);
- #endif
spin_lock_init(&nsh->stripe_lock);
list_add(&nsh->lru, &newstripes);
@@ -1646,8 +1616,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list),
- conf->device_lock,
- );
+ conf->device_lock);
osh = get_free_stripe(conf);
spin_unlock_irq(&conf->device_lock);
atomic_set(&nsh->count, 1);
@@ -2774,10 +2743,12 @@ static void handle_stripe_clean_event(struct r5conf *conf,
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_and_clear_bit(R5_Discard, &dev->flags))) {
+ test_bit(R5_Discard, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
+ if (test_and_clear_bit(R5_Discard, &dev->flags))
+ clear_bit(R5_UPTODATE, &dev->flags);
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_sector <
@@ -2795,7 +2766,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
- }
+ } else if (test_bit(R5_Discard, &sh->dev[i].flags))
+ clear_bit(R5_Discard, &sh->dev[i].flags);
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2852,8 +2824,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
- if (rmw < rcw && rmw > 0)
+ if (rmw < rcw && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
+ blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2864,7 +2838,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
if (
test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
pr_debug("Read_old block "
- "%d for r-m-w\n", i);
+ "%d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
@@ -2874,8 +2848,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
+ }
if (rcw <= rmw && rcw > 0) {
/* want reconstruct write, but need to get some data */
+ int qread =0;
rcw = 0;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2894,12 +2870,17 @@ static void handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
+ qread++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
+ if (rcw)
+ blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
+ (unsigned long long)sh->sector,
+ rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
}
/* now if nothing is locked, and if we have enough data,
* we can start a write request
@@ -3221,10 +3202,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
}
/* done submitting copies, wait for them to complete */
- if (tx) {
- async_tx_ack(tx);
- dma_wait_for_async_tx(tx);
- }
+ async_tx_quiesce(&tx);
}
/*
@@ -3490,40 +3468,6 @@ static void handle_stripe(struct stripe_head *sh)
handle_failed_sync(conf, sh, &s);
}
- /*
- * might be able to return some write requests if the parity blocks
- * are safe, or on a failed drive
- */
- pdev = &sh->dev[sh->pd_idx];
- s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
- || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
- qdev = &sh->dev[sh->qd_idx];
- s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
- || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
- || conf->level < 6;
-
- if (s.written &&
- (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
- && !test_bit(R5_LOCKED, &pdev->flags)
- && (test_bit(R5_UPTODATE, &pdev->flags) ||
- test_bit(R5_Discard, &pdev->flags))))) &&
- (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
- && !test_bit(R5_LOCKED, &qdev->flags)
- && (test_bit(R5_UPTODATE, &qdev->flags) ||
- test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
- /* Now we might consider reading some blocks, either to check/generate
- * parity, or to satisfy requests
- * or to load a block that is being partially written.
- */
- if (s.to_read || s.non_overwrite
- || (conf->level == 6 && s.to_write && s.failed)
- || (s.syncing && (s.uptodate + s.compute < disks))
- || s.replacing
- || s.expanding)
- handle_stripe_fill(sh, &s, disks);
-
/* Now we check to see if any write operations have recently
* completed
*/
@@ -3561,6 +3505,40 @@ static void handle_stripe(struct stripe_head *sh)
s.dec_preread_active = 1;
}
+ /*
+ * might be able to return some write requests if the parity blocks
+ * are safe, or on a failed drive
+ */
+ pdev = &sh->dev[sh->pd_idx];
+ s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+ qdev = &sh->dev[sh->qd_idx];
+ s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+ || conf->level < 6;
+
+ if (s.written &&
+ (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+ && !test_bit(R5_LOCKED, &pdev->flags)
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
+ (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+ && !test_bit(R5_LOCKED, &qdev->flags)
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
+ handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+ /* Now we might consider reading some blocks, either to check/generate
+ * parity, or to satisfy requests
+ * or to load a block that is being partially written.
+ */
+ if (s.to_read || s.non_overwrite
+ || (conf->level == 6 && s.to_write && s.failed)
+ || (s.syncing && (s.uptodate + s.compute < disks))
+ || s.replacing
+ || s.expanding)
+ handle_stripe_fill(sh, &s, disks);
+
/* Now to consider new write requests and what else, if anything
* should be read. We do not handle new writes when:
* 1/ A 'write' operation (copy+xor) is already in flight.
@@ -4000,10 +3978,13 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4078,6 +4059,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
struct stripe_head *sh;
struct mddev *mddev = cb->cb.data;
struct r5conf *conf = mddev->private;
+ int cnt = 0;
if (cb->list.next && !list_empty(&cb->list)) {
spin_lock_irq(&conf->device_lock);
@@ -4092,9 +4074,11 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
smp_mb__before_clear_bit();
clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
__release_stripe(conf, sh);
+ cnt++;
}
spin_unlock_irq(&conf->device_lock);
}
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -5529,6 +5513,10 @@ static int run(struct mddev *mddev)
* discard data disk but write parity disk
*/
stripe = stripe * PAGE_SIZE;
+ /* Round up to power of 2, as discard handling
+ * currently assumes that */
+ while ((stripe-1) & stripe)
+ stripe = (stripe | (stripe-1)) + 1;
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
/*
@@ -6088,7 +6076,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
- conf->device_lock, /* nothing */);
+ conf->device_lock);
conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
/* allow reshape to continue */