aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2018-02-01 00:41:33 -0800
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2018-02-01 00:41:33 -0800
commit10a558374f3751cf4eb55143008975641dfc2cf4 (patch)
treeaab0def9d2289f6520ffb62ec4c1d738135447c4 /drivers/md
parentInput: goodix - use generic touchscreen_properties (diff)
parentLinux 4.15 (diff)
downloadlinux-dev-10a558374f3751cf4eb55143008975641dfc2cf4.tar.xz
linux-dev-10a558374f3751cf4eb55143008975641dfc2cf4.zip
Merge tag 'v4.15' into next
Sync with mainline to get in trackpoint updates and other changes.
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig5
-rw-r--r--drivers/md/Makefile5
-rw-r--r--drivers/md/bcache/alloc.c17
-rw-r--r--drivers/md/bcache/bcache.h19
-rw-r--r--drivers/md/bcache/btree.c22
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/closure.h6
-rw-r--r--drivers/md/bcache/extents.c2
-rw-r--r--drivers/md/bcache/journal.c7
-rw-r--r--drivers/md/bcache/request.c35
-rw-r--r--drivers/md/bcache/stats.c8
-rw-r--r--drivers/md/bcache/super.c52
-rw-r--r--drivers/md/bcache/sysfs.c28
-rw-r--r--drivers/md/bcache/util.c10
-rw-r--r--drivers/md/bcache/util.h4
-rw-r--r--drivers/md/bcache/writeback.c117
-rw-r--r--drivers/md/bcache/writeback.h6
-rw-r--r--drivers/md/dm-bufio.c31
-rw-r--r--drivers/md/dm-cache-background-tracker.c18
-rw-r--r--drivers/md/dm-cache-metadata.c9
-rw-r--r--drivers/md/dm-cache-policy-smq.c42
-rw-r--r--drivers/md/dm-cache-target.c338
-rw-r--r--drivers/md/dm-core.h3
-rw-r--r--drivers/md/dm-crypt.c24
-rw-r--r--drivers/md/dm-delay.c6
-rw-r--r--drivers/md/dm-era-target.c1
-rw-r--r--drivers/md/dm-integrity.c72
-rw-r--r--drivers/md/dm-kcopyd.c4
-rw-r--r--drivers/md/dm-log-writes.c175
-rw-r--r--drivers/md/dm-mpath.c107
-rw-r--r--drivers/md/dm-raid.c39
-rw-r--r--drivers/md/dm-raid1.c8
-rw-r--r--drivers/md/dm-rq.c2
-rw-r--r--drivers/md/dm-snap.c48
-rw-r--r--drivers/md/dm-stats.c36
-rw-r--r--drivers/md/dm-switch.c2
-rw-r--r--drivers/md/dm-table.c67
-rw-r--r--drivers/md/dm-thin-metadata.c6
-rw-r--r--drivers/md/dm-thin.c24
-rw-r--r--drivers/md/dm-verity-target.c83
-rw-r--r--drivers/md/dm-verity.h5
-rw-r--r--drivers/md/dm-zoned-target.c13
-rw-r--r--drivers/md/dm.c60
-rw-r--r--drivers/md/dm.h3
-rw-r--r--drivers/md/md-bitmap.c (renamed from drivers/md/bitmap.c)29
-rw-r--r--drivers/md/md-bitmap.h (renamed from drivers/md/bitmap.h)0
-rw-r--r--drivers/md/md-cluster.c12
-rw-r--r--drivers/md/md-faulty.c (renamed from drivers/md/faulty.c)0
-rw-r--r--drivers/md/md-linear.c (renamed from drivers/md/linear.c)2
-rw-r--r--drivers/md/md-linear.h (renamed from drivers/md/linear.h)0
-rw-r--r--drivers/md/md-multipath.c (renamed from drivers/md/multipath.c)4
-rw-r--r--drivers/md/md-multipath.h (renamed from drivers/md/multipath.h)0
-rw-r--r--drivers/md/md.c168
-rw-r--r--drivers/md/md.h20
-rw-r--r--drivers/md/persistent-data/dm-btree.c19
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c3
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/md/raid1.c82
-rw-r--r--drivers/md/raid10.c173
-rw-r--r--drivers/md/raid10.h6
-rw-r--r--drivers/md/raid5-cache.c66
-rw-r--r--drivers/md/raid5-log.h2
-rw-r--r--drivers/md/raid5-ppl.c6
-rw-r--r--drivers/md/raid5.c83
64 files changed, 1323 insertions, 925 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4a249ee86364..83b9362be09c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -178,7 +178,7 @@ config MD_FAULTY
config MD_CLUSTER
- tristate "Cluster Support for MD (EXPERIMENTAL)"
+ tristate "Cluster Support for MD"
depends on BLK_DEV_MD
depends on DLM
default n
@@ -188,7 +188,8 @@ config MD_CLUSTER
nodes in the cluster can access the MD devices simultaneously.
This brings the redundancy (and uptime) of RAID levels across the
- nodes of the cluster.
+ nodes of the cluster. Currently, it can work with raid1 and raid10
+ (limited support).
If unsure, say N.
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index e94b6f9be941..f701bb211783 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -19,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
dm-cache-smq-y += dm-cache-policy-smq.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
-md-mod-y += md.o bitmap.o
+md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
+linear-y += md-linear.o
+multipath-y += md-multipath.o
+faulty-y += md-faulty.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 08035634795c..a0cc1bc6d884 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -407,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
finish_wait(&ca->set->bucket_wait, &w);
out:
- wake_up_process(ca->alloc_thread);
+ if (ca->alloc_thread)
+ wake_up_process(ca->alloc_thread);
trace_bcache_alloc(ca, reserve);
@@ -442,6 +443,11 @@ out:
b->prio = INITIAL_PRIO;
}
+ if (ca->set->avail_nbuckets > 0) {
+ ca->set->avail_nbuckets--;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
+
return r;
}
@@ -449,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
+
+ if (ca->set->avail_nbuckets < ca->set->nbuckets) {
+ ca->set->avail_nbuckets++;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
}
void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -479,7 +490,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
if (b == -1)
goto err;
- k->ptr[i] = PTR(ca->buckets[b].gen,
+ k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
ca->sb.nr_this_dev);
@@ -601,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
/*
* If we had to allocate, we might race and not need to allocate the
- * second time we call find_data_bucket(). If we allocated a bucket but
+ * second time we call pick_data_bucket(). If we allocated a bucket but
* didn't use it, drop the refcount bch_bucket_alloc_set() took:
*/
if (KEY_PTRS(&alloc.key))
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index abd31e847f96..843877e017e1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -185,6 +185,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -266,9 +267,6 @@ struct bcache_device {
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
- unsigned long sectors_dirty_last;
- long sectors_dirty_derivative;
-
struct bio_set *bio_split;
unsigned data_csum:1;
@@ -300,7 +298,7 @@ struct cached_dev {
struct semaphore sb_write_mutex;
/* Refcount on the cache set. Always nonzero when we're caching. */
- atomic_t count;
+ refcount_t count;
struct work_struct detach;
/*
@@ -363,12 +361,14 @@ struct cached_dev {
uint64_t writeback_rate_target;
int64_t writeback_rate_proportional;
- int64_t writeback_rate_derivative;
- int64_t writeback_rate_change;
+ int64_t writeback_rate_integral;
+ int64_t writeback_rate_integral_scaled;
+ int32_t writeback_rate_change;
unsigned writeback_rate_update_seconds;
- unsigned writeback_rate_d_term;
+ unsigned writeback_rate_i_term_inverse;
unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_minimum;
};
enum alloc_reserve {
@@ -582,6 +582,7 @@ struct cache_set {
uint8_t need_gc;
struct gc_stat gc_stats;
size_t nbuckets;
+ size_t avail_nbuckets;
struct task_struct *gc_thread;
/* Where in the btree gc currently is */
@@ -807,13 +808,13 @@ do { \
static inline void cached_dev_put(struct cached_dev *dc)
{
- if (atomic_dec_and_test(&dc->count))
+ if (refcount_dec_and_test(&dc->count))
schedule_work(&dc->detach);
}
static inline bool cached_dev_get(struct cached_dev *dc)
{
- if (!atomic_inc_not_zero(&dc->count))
+ if (!refcount_inc_not_zero(&dc->count))
return false;
/* Paired with the mb in cached_dev_attach */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 658c54b3b07a..81e8dc3dbe5e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -807,7 +807,10 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->shrink.scan_objects = bch_mca_scan;
c->shrink.seeks = 4;
c->shrink.batch = c->btree_pages * 2;
- register_shrinker(&c->shrink);
+
+ if (register_shrinker(&c->shrink))
+ pr_warn("bcache: %s: could not register shrinker",
+ __func__);
return 0;
}
@@ -1241,6 +1244,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
__bch_btree_mark_key(c, level, k);
}
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
+{
+ stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
+}
+
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
uint8_t stale = 0;
@@ -1652,9 +1660,8 @@ static void btree_gc_start(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
-static size_t bch_btree_gc_finish(struct cache_set *c)
+static void bch_btree_gc_finish(struct cache_set *c)
{
- size_t available = 0;
struct bucket *b;
struct cache *ca;
unsigned i;
@@ -1691,6 +1698,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
}
rcu_read_unlock();
+ c->avail_nbuckets = 0;
for_each_cache(ca, c, i) {
uint64_t *i;
@@ -1712,18 +1720,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
- available++;
+ c->avail_nbuckets++;
}
}
mutex_unlock(&c->bucket_lock);
- return available;
}
static void bch_btree_gc(struct cache_set *c)
{
int ret;
- unsigned long available;
struct gc_stat stats;
struct closure writes;
struct btree_op op;
@@ -1746,14 +1752,14 @@ static void bch_btree_gc(struct cache_set *c)
pr_warn("gc failed!");
} while (ret);
- available = bch_btree_gc_finish(c);
+ bch_btree_gc_finish(c);
wake_up_allocators(c);
bch_time_stats_update(&c->btree_gc_time, start_time);
stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
- stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+ bch_update_bucket_in_use(c, &stats);
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
trace_bcache_gc_end(c);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 42204d61bc95..d211e2c25b6b 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -306,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
struct bkey *, keybuf_pred_fn *);
-
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
#endif
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 965907ce1e20..ccfbea6f9f6b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -252,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
+ /**
+ * Changes made to closure, work_struct, or a couple of other structs
+ * may cause work.func not pointing to the right location.
+ */
+ BUILD_BUG_ON(offsetof(struct closure, fn)
+ != offsetof(struct work_struct, func));
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 41c238fc3733..f9d391711595 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -585,7 +585,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
return false;
for (i = 0; i < KEY_PTRS(l); i++)
- if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+ if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
return false;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 02a98ddb592d..a87165c1d8e5 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -170,6 +170,11 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ca->sb.njournal_buckets; i++) {
+ /*
+ * We must try the index l with ZERO first for
+ * correctness due to the scenario that the journal
+ * bucket is circular buffer which might have wrapped
+ */
l = (i * 2654435769U) % ca->sb.njournal_buckets;
if (test_bit(l, bitmap))
@@ -507,7 +512,7 @@ static void journal_reclaim(struct cache_set *c)
continue;
ja->cur_idx = next;
- k->ptr[n++] = PTR(0,
+ k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3475d6628e21..643c3021624f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -27,12 +27,12 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *);
-static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+static unsigned cache_mode(struct cached_dev *dc)
{
return BDEV_CACHE_MODE(&dc->sb);
}
-static bool verify(struct cached_dev *dc, struct bio *bio)
+static bool verify(struct cached_dev *dc)
{
return dc->verify;
}
@@ -370,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
struct cache_set *c = dc->disk.c;
- unsigned mode = cache_mode(dc, bio);
+ unsigned mode = cache_mode(dc);
unsigned sectors, congested = bch_get_congested(c);
struct task_struct *task = current;
struct io *i;
@@ -385,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
op_is_write(bio_op(bio))))
goto skip;
+ /*
+ * Flag for bypass if the IO is for read-ahead or background,
+ * unless the read-ahead request is for metadata (eg, for gfs2).
+ */
+ if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+ !(bio->bi_opf & REQ_META))
+ goto skip;
+
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
pr_debug("skipping unaligned io");
@@ -463,6 +471,7 @@ struct search {
unsigned recoverable:1;
unsigned write:1;
unsigned read_dirty_data:1;
+ unsigned cache_missed:1;
unsigned long start_time;
@@ -649,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->orig_bio = bio;
s->cache_miss = NULL;
+ s->cache_missed = 0;
s->d = d;
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
@@ -699,7 +709,14 @@ static void cached_dev_read_error(struct closure *cl)
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = &s->bio.bio;
- if (s->recoverable) {
+ /*
+ * If read request hit dirty data (s->read_dirty_data is true),
+ * then recovery a failed read request from cached device may
+ * get a stale data back. So read failure recovery is only
+ * permitted when read request hit clean data in cache device,
+ * or when cache read race happened.
+ */
+ if (s->recoverable && !s->read_dirty_data) {
/* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
@@ -740,7 +757,7 @@ static void cached_dev_read_done(struct closure *cl)
s->cache_miss = NULL;
}
- if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+ if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
bio_complete(s);
@@ -760,12 +777,12 @@ static void cached_dev_read_done_bh(struct closure *cl)
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
- !s->cache_miss, s->iop.bypass);
+ !s->cache_missed, s->iop.bypass);
trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
if (s->iop.status)
continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
- else if (s->iop.bio || verify(dc, &s->bio.bio))
+ else if (s->iop.bio || verify(dc))
continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
else
continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -779,6 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ s->cache_missed = 1;
+
if (s->cache_miss || s->iop.bypass) {
miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@ -892,7 +911,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
s->iop.bypass = true;
if (should_writeback(dc, s->orig_bio,
- cache_mode(dc, bio),
+ cache_mode(dc),
s->iop.bypass)) {
s->iop.bypass = false;
s->iop.writeback = true;
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index d0831d5bcc87..be119326297b 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -147,9 +147,9 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
}
}
-static void scale_accounting(unsigned long data)
+static void scale_accounting(struct timer_list *t)
{
- struct cache_accounting *acc = (struct cache_accounting *) data;
+ struct cache_accounting *acc = from_timer(acc, t, timer);
#define move_stat(name) do { \
unsigned t = atomic_xchg(&acc->collector.name, 0); \
@@ -234,9 +234,7 @@ void bch_cache_accounting_init(struct cache_accounting *acc,
kobject_init(&acc->day.kobj, &bch_stats_ktype);
closure_init(&acc->cl, parent);
- init_timer(&acc->timer);
+ timer_setup(&acc->timer, scale_accounting, 0);
acc->timer.expires = jiffies + accounting_delay;
- acc->timer.data = (unsigned long) acc;
- acc->timer.function = scale_accounting;
add_timer(&acc->timer);
}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fc0a31b13ac4..b4d28928dec5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
static int bcache_major;
-static DEFINE_IDA(bcache_minor);
+static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS 16 /* partition support */
+/* limitation of partitions number on single bcache device */
+#define BCACHE_MINORS 128
+/* limitation of bcache devices number on single system */
+#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
/* Superblock */
@@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
closure_get(&c->caching);
}
+static inline int first_minor_to_idx(int first_minor)
+{
+ return (first_minor/BCACHE_MINORS);
+}
+
+static inline int idx_to_first_minor(int idx)
+{
+ return (idx * BCACHE_MINORS);
+}
+
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d)
if (d->disk && d->disk->queue)
blk_cleanup_queue(d->disk->queue);
if (d->disk) {
- ida_simple_remove(&bcache_minor, d->disk->first_minor);
+ ida_simple_remove(&bcache_device_idx,
+ first_minor_to_idx(d->disk->first_minor));
put_disk(d->disk);
}
@@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
{
struct request_queue *q;
size_t n;
- int minor;
+ int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
@@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
if (!d->full_dirty_stripes)
return -ENOMEM;
- minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
- if (minor < 0)
- return minor;
-
- minor *= BCACHE_MINORS;
+ idx = ida_simple_get(&bcache_device_idx, 0,
+ BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
+ if (idx < 0)
+ return idx;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS |
BIOSET_NEED_RESCUER)) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_minor, minor);
+ ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
}
set_capacity(d->disk, sectors);
- snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
d->disk->major = bcache_major;
- d->disk->first_minor = minor;
+ d->disk->first_minor = idx_to_first_minor(idx);
d->disk->fops = &bcache_ops;
d->disk->private_data = d;
@@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
closure_init_stack(&cl);
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
- BUG_ON(atomic_read(&dc->count));
+ BUG_ON(refcount_read(&dc->count));
mutex_lock(&bch_register_lock);
@@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
* dc->c must be set before dc->count != 0 - paired with the mb in
* cached_dev_get()
*/
- atomic_set(&dc->count, 1);
+ refcount_set(&dc->count, 1);
/* Block writeback thread, but spawn it */
down_write(&dc->writeback_lock);
@@ -1028,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
bch_writeback_queue(dc);
}
@@ -1129,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
if (ret)
return ret;
- set_capacity(dc->disk.disk,
- dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
-
dc->disk.disk->queue->backing_dev_info->ra_pages =
max(dc->disk.disk->queue->backing_dev_info->ra_pages,
q->backing_dev_info->ra_pages);
@@ -2085,6 +2095,7 @@ static void bcache_exit(void)
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
}
static int __init bcache_init(void)
@@ -2103,14 +2114,15 @@ static int __init bcache_init(void)
bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) {
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
return bcache_major;
}
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- sysfs_create_files(bcache_kobj, files) ||
bch_request_init() ||
- bch_debug_init(bcache_kobj))
+ bch_debug_init(bcache_kobj) ||
+ sysfs_create_files(bcache_kobj, files))
goto err;
return 0;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 234b2f5b286d..b4184092c727 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -82,8 +82,9 @@ rw_attribute(writeback_delay);
rw_attribute(writeback_rate);
rw_attribute(writeback_rate_update_seconds);
-rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_i_term_inverse);
rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_minimum);
read_attribute(writeback_rate_debug);
read_attribute(stripe_size);
@@ -131,15 +132,16 @@ SHOW(__bch_cached_dev)
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
var_print(writeback_rate_update_seconds);
- var_print(writeback_rate_d_term);
+ var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_minimum);
if (attr == &sysfs_writeback_rate_debug) {
char rate[20];
char dirty[20];
char target[20];
char proportional[20];
- char derivative[20];
+ char integral[20];
char change[20];
s64 next_io;
@@ -147,7 +149,7 @@ SHOW(__bch_cached_dev)
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
- bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
bch_hprint(change, dc->writeback_rate_change << 9);
next_io = div64_s64(dc->writeback_rate.next - local_clock(),
@@ -158,11 +160,11 @@ SHOW(__bch_cached_dev)
"dirty:\t\t%s\n"
"target:\t\t%s\n"
"proportional:\t%s\n"
- "derivative:\t%s\n"
+ "integral:\t%s\n"
"change:\t\t%s/sec\n"
"next io:\t%llims\n",
rate, dirty, target, proportional,
- derivative, change, next_io);
+ integral, change, next_io);
}
sysfs_hprint(dirty_data,
@@ -214,7 +216,7 @@ STORE(__cached_dev)
dc->writeback_rate.rate, 1, INT_MAX);
d_strtoul_nonzero(writeback_rate_update_seconds);
- d_strtoul(writeback_rate_d_term);
+ d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
d_strtoi_h(sequential_cutoff);
@@ -320,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_percent,
&sysfs_writeback_rate,
&sysfs_writeback_rate_update_seconds,
- &sysfs_writeback_rate_d_term,
+ &sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
@@ -746,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = {
};
KTYPE(bch_cache_set_internal);
+static int __bch_cache_cmp(const void *l, const void *r)
+{
+ return *((uint16_t *)r) - *((uint16_t *)l);
+}
+
SHOW(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -770,9 +777,6 @@ SHOW(__bch_cache)
CACHE_REPLACEMENT(&ca->sb));
if (attr == &sysfs_priority_stats) {
- int cmp(const void *l, const void *r)
- { return *((uint16_t *) r) - *((uint16_t *) l); }
-
struct bucket *b;
size_t n = ca->sb.nbuckets, i;
size_t unused = 0, available = 0, dirty = 0, meta = 0;
@@ -801,7 +805,7 @@ SHOW(__bch_cache)
p[i] = ca->buckets[i].prio;
mutex_unlock(&ca->set->bucket_lock);
- sort(p, n, sizeof(uint16_t), cmp, NULL);
+ sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL);
while (n &&
!cached[n - 1])
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 176d3c2ef5f5..e548b8b51322 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
d->next += div_u64(done * NSEC_PER_SEC, d->rate);
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
+ /* Bound the time. Don't let us fall further than 2 seconds behind
+ * (this prevents unnecessary backlog that would make it impossible
+ * to catch up). If we're ahead of the desired writeback rate,
+ * don't let us sleep more than 2.5 seconds (so we can notice/respond
+ * if the control system tells us to speed up!).
+ */
+ if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next))
+ d->next = now + NSEC_PER_SEC * 5LLU / 2LLU;
if (time_after64(now - NSEC_PER_SEC * 2, d->next))
d->next = now - NSEC_PER_SEC * 2;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f54b58282f77..ed5e8a412eb8 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,10 +442,10 @@ struct bch_ratelimit {
uint64_t next;
/*
- * Rate at which we want to do work, in units per nanosecond
+ * Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- unsigned rate;
+ uint32_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 70454f2ad2fa..56a37884ca8b 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc)
bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
-
int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
c->cached_dev_sectors);
- /* PD controller */
-
+ /*
+ * PI controller:
+ * Figures out the amount that should be written per second.
+ *
+ * First, the error (number of sectors that are dirty beyond our
+ * target) is calculated. The error is accumulated (numerically
+ * integrated).
+ *
+ * Then, the proportional value and integral value are scaled
+ * based on configured values. These are stored as inverses to
+ * avoid fixed point math and to make configuration easy-- e.g.
+ * the default value of 40 for writeback_rate_p_term_inverse
+ * attempts to write at a rate that would retire all the dirty
+ * blocks in 40 seconds.
+ *
+ * The writeback_rate_i_inverse value of 10000 means that 1/10000th
+ * of the error is accumulated in the integral term per second.
+ * This acts as a slow, long-term average that is not subject to
+ * variations in usage like the p term.
+ */
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
- int64_t derivative = dirty - dc->disk.sectors_dirty_last;
- int64_t proportional = dirty - target;
- int64_t change;
-
- dc->disk.sectors_dirty_last = dirty;
-
- /* Scale to sectors per second */
-
- proportional *= dc->writeback_rate_update_seconds;
- proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-
- derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-
- derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
- (dc->writeback_rate_d_term /
- dc->writeback_rate_update_seconds) ?: 1, 0);
-
- derivative *= dc->writeback_rate_d_term;
- derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-
- change = proportional + derivative;
+ int64_t error = dirty - target;
+ int64_t proportional_scaled =
+ div_s64(error, dc->writeback_rate_p_term_inverse);
+ int64_t integral_scaled;
+ uint32_t new_rate;
+
+ if ((error < 0 && dc->writeback_rate_integral > 0) ||
+ (error > 0 && time_before64(local_clock(),
+ dc->writeback_rate.next + NSEC_PER_MSEC))) {
+ /*
+ * Only decrease the integral term if it's more than
+ * zero. Only increase the integral term if the device
+ * is keeping up. (Don't wind up the integral
+ * ineffectively in either case).
+ *
+ * It's necessary to scale this by
+ * writeback_rate_update_seconds to keep the integral
+ * term dimensioned properly.
+ */
+ dc->writeback_rate_integral += error *
+ dc->writeback_rate_update_seconds;
+ }
- /* Don't increase writeback rate if the device isn't keeping up */
- if (change > 0 &&
- time_after64(local_clock(),
- dc->writeback_rate.next + NSEC_PER_MSEC))
- change = 0;
+ integral_scaled = div_s64(dc->writeback_rate_integral,
+ dc->writeback_rate_i_term_inverse);
- dc->writeback_rate.rate =
- clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
- 1, NSEC_PER_MSEC);
+ new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
+ dc->writeback_rate_minimum, NSEC_PER_SEC);
- dc->writeback_rate_proportional = proportional;
- dc->writeback_rate_derivative = derivative;
- dc->writeback_rate_change = change;
+ dc->writeback_rate_proportional = proportional_scaled;
+ dc->writeback_rate_integral_scaled = integral_scaled;
+ dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+ dc->writeback_rate.rate = new_rate;
dc->writeback_rate_target = target;
}
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
- io->bio.bi_iter.bi_sector = KEY_START(&w->key);
- bio_set_dev(&io->bio, io->dc->bdev);
- io->bio.bi_end_io = dirty_endio;
+ /*
+ * IO errors are signalled using the dirty bit on the key.
+ * If we failed to read, we should not attempt to write to the
+ * backing device. Instead, immediately go to write_dirty_finish
+ * to clean up.
+ */
+ if (KEY_DIRTY(&w->key)) {
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+ bio_set_dev(&io->bio, io->dc->bdev);
+ io->bio.bi_end_io = dirty_endio;
- closure_bio_submit(&io->bio, cl);
+ closure_bio_submit(&io->bio, cl);
+ }
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg)
struct cached_dev *dc = arg;
bool searched_full_index;
+ bch_ratelimit_reset(&dc->writeback_rate);
+
while (!kthread_should_stop()) {
down_write(&dc->writeback_lock);
if (!atomic_read(&dc->has_dirty) ||
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
- bch_ratelimit_reset(&dc->writeback_rate);
read_dirty(dc);
if (searched_full_index) {
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg)
!kthread_should_stop() &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
delay = schedule_timeout_interruptible(delay);
+
+ bch_ratelimit_reset(&dc->writeback_rate);
}
}
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
-
- d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_percent = 10;
dc->writeback_delay = 30;
dc->writeback_rate.rate = 1024;
+ dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = 5;
- dc->writeback_rate_d_term = 30;
- dc->writeback_rate_p_term_inverse = 6000;
+ dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_i_term_inverse = 10000;
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 151544740148..a9e3ffb4b03c 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -77,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (would_skip)
return false;
- return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+ return (op_is_sync(bio->bi_opf) ||
+ bio->bi_opf & (REQ_META|REQ_PRIO) ||
+ in_use <= CUTOFF_WRITEBACK);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
@@ -90,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
{
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d216a8f7bc22..c546b567f3b5 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -347,7 +347,7 @@ static void __cache_size_refresh(void)
BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
BUG_ON(dm_bufio_client_count < 0);
- dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
+ dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
/*
* Use default if set to 0 and report the actual cache size used.
@@ -960,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c,
{
unsigned long buffers;
- if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
+ if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
if (mutex_trylock(&dm_bufio_clients_lock)) {
__cache_size_refresh();
mutex_unlock(&dm_bufio_clients_lock);
@@ -974,7 +974,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
buffers = c->minimum_buffers;
*limit_buffers = buffers;
- *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
+ *threshold_buffers = mult_frac(buffers,
+ DM_BUFIO_WRITEBACK_PERCENT, 100);
}
/*
@@ -1600,7 +1601,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
- unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
}
@@ -1610,7 +1611,8 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
int l;
struct dm_buffer *b, *tmp;
unsigned long freed = 0;
- unsigned long count = nr_to_scan;
+ unsigned long count = c->n_buffers[LIST_CLEAN] +
+ c->n_buffers[LIST_DIRTY];
unsigned long retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
@@ -1646,8 +1648,11 @@ static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
+ unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
+ READ_ONCE(c->n_buffers[LIST_DIRTY]);
+ unsigned long retain_target = get_retain_buffers(c);
- return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]);
+ return (count < retain_target) ? 0 : (count - retain_target);
}
/*
@@ -1818,7 +1823,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
static unsigned get_max_age_hz(void)
{
- unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
+ unsigned max_age = READ_ONCE(dm_bufio_max_age);
if (max_age > UINT_MAX / HZ)
max_age = UINT_MAX / HZ;
@@ -1910,19 +1915,15 @@ static int __init dm_bufio_init(void)
memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
- mem = (__u64)((totalram_pages - totalhigh_pages) *
- DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
+ mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
+ DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
if (mem > ULONG_MAX)
mem = ULONG_MAX;
#ifdef CONFIG_MMU
- /*
- * Get the size of vmalloc space the same way as VMALLOC_TOTAL
- * in fs/proc/internal.h
- */
- if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
- mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
+ if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
+ mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif
dm_bufio_default_cache_size = mem;
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 707233891291..1d0af0a21fc7 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -161,8 +161,17 @@ EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
static bool max_work_reached(struct background_tracker *b)
{
- // FIXME: finish
- return false;
+ return atomic_read(&b->pending_promotes) +
+ atomic_read(&b->pending_writebacks) +
+ atomic_read(&b->pending_demotes) >= b->max_work;
+}
+
+struct bt_work *alloc_work(struct background_tracker *b)
+{
+ if (max_work_reached(b))
+ return NULL;
+
+ return kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
}
int btracker_queue(struct background_tracker *b,
@@ -174,10 +183,7 @@ int btracker_queue(struct background_tracker *b,
if (pwork)
*pwork = NULL;
- if (max_work_reached(b))
- return -ENOMEM;
-
- w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+ w = alloc_work(b);
if (!w)
return -ENOMEM;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 4a4e9c75fc4c..0d7212410e21 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -13,6 +13,7 @@
#include "persistent-data/dm-transaction-manager.h"
#include <linux/device-mapper.h>
+#include <linux/refcount.h>
/*----------------------------------------------------------------*/
@@ -100,7 +101,7 @@ struct cache_disk_superblock {
} __packed;
struct dm_cache_metadata {
- atomic_t ref_count;
+ refcount_t ref_count;
struct list_head list;
unsigned version;
@@ -753,7 +754,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
}
cmd->version = metadata_version;
- atomic_set(&cmd->ref_count, 1);
+ refcount_set(&cmd->ref_count, 1);
init_rwsem(&cmd->root_lock);
cmd->bdev = bdev;
cmd->data_block_size = data_block_size;
@@ -791,7 +792,7 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
list_for_each_entry(cmd, &table, list)
if (cmd->bdev == bdev) {
- atomic_inc(&cmd->ref_count);
+ refcount_inc(&cmd->ref_count);
return cmd;
}
@@ -862,7 +863,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
{
- if (atomic_dec_and_test(&cmd->ref_count)) {
+ if (refcount_dec_and_test(&cmd->ref_count)) {
mutex_lock(&table_lock);
list_del(&cmd->list);
mutex_unlock(&table_lock);
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index e5eb9c9b4bc8..4ab23d0075f6 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -213,6 +213,19 @@ static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
l->nr_elts--;
}
+static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
+{
+ struct entry *e;
+
+ for (e = l_head(es, l); e; e = l_next(es, e))
+ if (!e->sentinel) {
+ l_del(es, l, e);
+ return e;
+ }
+
+ return NULL;
+}
+
static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
struct entry *e;
@@ -719,7 +732,7 @@ static struct entry *alloc_entry(struct entry_alloc *ea)
if (l_empty(&ea->free))
return NULL;
- e = l_pop_tail(ea->es, &ea->free);
+ e = l_pop_head(ea->es, &ea->free);
init_entry(e);
ea->nr_allocated++;
@@ -1158,13 +1171,13 @@ static void clear_pending(struct smq_policy *mq, struct entry *e)
e->pending_work = false;
}
-static void queue_writeback(struct smq_policy *mq)
+static void queue_writeback(struct smq_policy *mq, bool idle)
{
int r;
struct policy_work work;
struct entry *e;
- e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
+ e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
if (e) {
mark_pending(mq, e);
q_del(&mq->dirty, e);
@@ -1174,12 +1187,16 @@ static void queue_writeback(struct smq_policy *mq)
work.cblock = infer_cblock(mq, e);
r = btracker_queue(mq->bg_work, &work, NULL);
- WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
+ if (r) {
+ clear_pending(mq, e);
+ q_push_front(&mq->dirty, e);
+ }
}
}
static void queue_demotion(struct smq_policy *mq)
{
+ int r;
struct policy_work work;
struct entry *e;
@@ -1189,7 +1206,7 @@ static void queue_demotion(struct smq_policy *mq)
e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
if (!e) {
if (!clean_target_met(mq, true))
- queue_writeback(mq);
+ queue_writeback(mq, false);
return;
}
@@ -1199,12 +1216,17 @@ static void queue_demotion(struct smq_policy *mq)
work.op = POLICY_DEMOTE;
work.oblock = e->oblock;
work.cblock = infer_cblock(mq, e);
- btracker_queue(mq->bg_work, &work, NULL);
+ r = btracker_queue(mq->bg_work, &work, NULL);
+ if (r) {
+ clear_pending(mq, e);
+ q_push_front(&mq->clean, e);
+ }
}
static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
struct policy_work **workp)
{
+ int r;
struct entry *e;
struct policy_work work;
@@ -1234,7 +1256,9 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
work.op = POLICY_PROMOTE;
work.oblock = oblock;
work.cblock = infer_cblock(mq, e);
- btracker_queue(mq->bg_work, &work, workp);
+ r = btracker_queue(mq->bg_work, &work, workp);
+ if (r)
+ free_entry(&mq->cache_alloc, e);
}
/*----------------------------------------------------------------*/
@@ -1418,7 +1442,7 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
r = btracker_issue(mq->bg_work, result);
if (r == -ENODATA) {
if (!clean_target_met(mq, idle)) {
- queue_writeback(mq);
+ queue_writeback(mq, idle);
r = btracker_issue(mq->bg_work, result);
}
}
@@ -1778,7 +1802,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
mq->next_hotspot_period = jiffies;
mq->next_cache_period = jiffies;
- mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
+ mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
if (!mq->bg_work)
goto bad_btracker;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 8785134c9f1f..47407e43b96a 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -408,9 +408,7 @@ struct cache {
int sectors_per_block_shift;
spinlock_t lock;
- struct list_head deferred_cells;
struct bio_list deferred_bios;
- struct bio_list deferred_writethrough_bios;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
@@ -446,10 +444,10 @@ struct cache {
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
struct work_struct deferred_bio_worker;
- struct work_struct deferred_writethrough_worker;
struct work_struct migration_worker;
struct delayed_work waker;
struct dm_bio_prison_v2 *prison;
+ struct bio_set *bs;
mempool_t *migration_pool;
@@ -490,15 +488,6 @@ struct per_bio_data {
struct dm_bio_prison_cell_v2 *cell;
struct dm_hook_info hook_info;
sector_t len;
-
- /*
- * writethrough fields. These MUST remain at the end of this
- * structure and the 'cache' member must be the first as it
- * is used to determine the offset of the writethrough fields.
- */
- struct cache *cache;
- dm_cblock_t cblock;
- struct dm_bio_details bio_details;
};
struct dm_cache_migration {
@@ -515,19 +504,19 @@ struct dm_cache_migration {
/*----------------------------------------------------------------*/
-static bool writethrough_mode(struct cache_features *f)
+static bool writethrough_mode(struct cache *cache)
{
- return f->io_mode == CM_IO_WRITETHROUGH;
+ return cache->features.io_mode == CM_IO_WRITETHROUGH;
}
-static bool writeback_mode(struct cache_features *f)
+static bool writeback_mode(struct cache *cache)
{
- return f->io_mode == CM_IO_WRITEBACK;
+ return cache->features.io_mode == CM_IO_WRITEBACK;
}
-static inline bool passthrough_mode(struct cache_features *f)
+static inline bool passthrough_mode(struct cache *cache)
{
- return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
+ return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
}
/*----------------------------------------------------------------*/
@@ -537,14 +526,9 @@ static void wake_deferred_bio_worker(struct cache *cache)
queue_work(cache->wq, &cache->deferred_bio_worker);
}
-static void wake_deferred_writethrough_worker(struct cache *cache)
-{
- queue_work(cache->wq, &cache->deferred_writethrough_worker);
-}
-
static void wake_migration_worker(struct cache *cache)
{
- if (passthrough_mode(&cache->features))
+ if (passthrough_mode(cache))
return;
queue_work(cache->wq, &cache->migration_worker);
@@ -567,10 +551,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
struct dm_cache_migration *mg;
mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
- if (mg) {
- mg->cache = cache;
- atomic_inc(&mg->cache->nr_allocated_migrations);
- }
+ if (!mg)
+ return NULL;
+
+ memset(mg, 0, sizeof(*mg));
+
+ mg->cache = cache;
+ atomic_inc(&cache->nr_allocated_migrations);
return mg;
}
@@ -618,27 +605,16 @@ static unsigned lock_level(struct bio *bio)
* Per bio data
*--------------------------------------------------------------*/
-/*
- * If using writeback, leave out struct per_bio_data's writethrough fields.
- */
-#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
-#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
-
-static size_t get_per_bio_data_size(struct cache *cache)
-{
- return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
-}
-
-static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
+static struct per_bio_data *get_per_bio_data(struct bio *bio)
{
- struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
BUG_ON(!pb);
return pb;
}
-static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+static struct per_bio_data *init_per_bio_data(struct bio *bio)
{
- struct per_bio_data *pb = get_per_bio_data(bio, data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
pb->tick = false;
pb->req_nr = dm_bio_get_target_bio_nr(bio);
@@ -678,7 +654,6 @@ static void defer_bios(struct cache *cache, struct bio_list *bios)
static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
{
bool r;
- size_t pb_size;
struct per_bio_data *pb;
struct dm_cell_key_v2 key;
dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
@@ -703,8 +678,7 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi
if (cell != cell_prealloc)
free_prison_cell(cache, cell_prealloc);
- pb_size = get_per_bio_data_size(cache);
- pb = get_per_bio_data(bio, pb_size);
+ pb = get_per_bio_data(bio);
pb->cell = cell;
return r;
@@ -856,28 +830,35 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
unsigned long flags;
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb;
spin_lock_irqsave(&cache->lock, flags);
if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
bio_op(bio) != REQ_OP_DISCARD) {
+ pb = get_per_bio_data(bio);
pb->tick = true;
cache->need_tick_bio = false;
}
spin_unlock_irqrestore(&cache->lock, flags);
}
-static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock)
+static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, bool bio_has_pbd)
{
- // FIXME: this is called way too much.
- check_if_tick_bio_needed(cache, bio);
+ if (bio_has_pbd)
+ check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock)
+{
+ // FIXME: check_if_tick_bio_needed() is called way too much through this interface
+ __remap_to_origin_clear_discard(cache, bio, oblock, true);
+}
+
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
@@ -908,10 +889,10 @@ static bool accountable_bio(struct cache *cache, struct bio *bio)
static void accounted_begin(struct cache *cache, struct bio *bio)
{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb;
if (accountable_bio(cache, bio)) {
+ pb = get_per_bio_data(bio);
pb->len = bio_sectors(bio);
iot_io_begin(&cache->tracker, pb->len);
}
@@ -919,8 +900,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
static void accounted_complete(struct cache *cache, struct bio *bio)
{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
iot_io_end(&cache->tracker, pb->len);
}
@@ -937,57 +917,26 @@ static void issue_op(struct bio *bio, void *context)
accounted_request(cache, bio);
}
-static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_add(&cache->deferred_writethrough_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- wake_deferred_writethrough_worker(cache);
-}
-
-static void writethrough_endio(struct bio *bio)
-{
- struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
-
- dm_unhook_bio(&pb->hook_info, bio);
-
- if (bio->bi_status) {
- bio_endio(bio);
- return;
- }
-
- dm_bio_restore(&pb->bio_details, bio);
- remap_to_cache(pb->cache, bio, pb->cblock);
-
- /*
- * We can't issue this bio directly, since we're in interrupt
- * context. So it gets put on a bio list for processing by the
- * worker thread.
- */
- defer_writethrough_bio(pb->cache, bio);
-}
-
/*
- * FIXME: send in parallel, huge latency as is.
* When running in writethrough mode we need to send writes to clean blocks
- * to both the cache and origin devices. In future we'd like to clone the
- * bio and send them in parallel, but for now we're doing them in
- * series as this is easier.
+ * to both the cache and origin devices. Clone the bio and send them in parallel.
*/
-static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock, dm_cblock_t cblock)
+static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
{
- struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+ struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+
+ BUG_ON(!origin_bio);
- pb->cache = cache;
- pb->cblock = cblock;
- dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
- dm_bio_record(&pb->bio_details, bio);
+ bio_chain(origin_bio, bio);
+ /*
+ * Passing false to __remap_to_origin_clear_discard() skips
+ * all code that might use per_bio_data (since clone doesn't have it)
+ */
+ __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
+ submit_bio(origin_bio);
- remap_to_origin_clear_discard(pb->cache, bio, oblock);
+ remap_to_cache(cache, bio, cblock);
}
/*----------------------------------------------------------------
@@ -1201,6 +1150,18 @@ static void background_work_end(struct cache *cache)
/*----------------------------------------------------------------*/
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+}
+
+static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
+{
+ return writeback_mode(cache) &&
+ (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
+}
+
static void quiesce(struct dm_cache_migration *mg,
void (*continuation)(struct work_struct *))
{
@@ -1248,8 +1209,7 @@ static int copy(struct dm_cache_migration *mg, bool promote)
static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
free_prison_cell(cache, pb->cell);
@@ -1260,23 +1220,21 @@ static void overwrite_endio(struct bio *bio)
{
struct dm_cache_migration *mg = bio->bi_private;
struct cache *cache = mg->cache;
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
dm_unhook_bio(&pb->hook_info, bio);
if (bio->bi_status)
mg->k.input = bio->bi_status;
- queue_continuation(mg->cache->wq, &mg->k);
+ queue_continuation(cache->wq, &mg->k);
}
static void overwrite(struct dm_cache_migration *mg,
void (*continuation)(struct work_struct *))
{
struct bio *bio = mg->overwrite_bio;
- size_t pb_data_size = get_per_bio_data_size(mg->cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
@@ -1474,13 +1432,51 @@ static void mg_upgrade_lock(struct work_struct *ws)
}
}
+static void mg_full_copy(struct work_struct *ws)
+{
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ bool is_policy_promote = (op->op == POLICY_PROMOTE);
+
+ if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+ is_discarded_oblock(cache, op->oblock)) {
+ mg_upgrade_lock(ws);
+ return;
+ }
+
+ init_continuation(&mg->k, mg_upgrade_lock);
+
+ if (copy(mg, is_policy_promote)) {
+ DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+ mg->k.input = BLK_STS_IOERR;
+ mg_complete(mg, false);
+ }
+}
+
static void mg_copy(struct work_struct *ws)
{
- int r;
struct dm_cache_migration *mg = ws_to_mg(ws);
if (mg->overwrite_bio) {
/*
+ * No exclusive lock was held when we last checked if the bio
+ * was optimisable. So we have to check again in case things
+ * have changed (eg, the block may no longer be discarded).
+ */
+ if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
+ /*
+ * Fallback to a real full copy after doing some tidying up.
+ */
+ bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
+ BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
+ mg->overwrite_bio = NULL;
+ inc_io_migrations(mg->cache);
+ mg_full_copy(ws);
+ return;
+ }
+
+ /*
* It's safe to do this here, even though it's new data
* because all IO has been locked out of the block.
*
@@ -1489,26 +1485,8 @@ static void mg_copy(struct work_struct *ws)
*/
overwrite(mg, mg_update_metadata_after_copy);
- } else {
- struct cache *cache = mg->cache;
- struct policy_work *op = mg->op;
- bool is_policy_promote = (op->op == POLICY_PROMOTE);
-
- if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
- is_discarded_oblock(cache, op->oblock)) {
- mg_upgrade_lock(ws);
- return;
- }
-
- init_continuation(&mg->k, mg_upgrade_lock);
-
- r = copy(mg, is_policy_promote);
- if (r) {
- DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
- mg->k.input = BLK_STS_IOERR;
- mg_complete(mg, false);
- }
- }
+ } else
+ mg_full_copy(ws);
}
static int mg_lock_writes(struct dm_cache_migration *mg)
@@ -1567,9 +1545,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio
return -ENOMEM;
}
- memset(mg, 0, sizeof(*mg));
-
- mg->cache = cache;
mg->op = op;
mg->overwrite_bio = bio;
@@ -1703,9 +1678,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
return -ENOMEM;
}
- memset(mg, 0, sizeof(*mg));
-
- mg->cache = cache;
mg->overwrite_bio = bio;
mg->invalidate_cblock = cblock;
mg->invalidate_oblock = oblock;
@@ -1748,26 +1720,12 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
/*----------------------------------------------------------------*/
-static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
-{
- return (bio_data_dir(bio) == WRITE) &&
- (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
-}
-
-static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
-{
- return writeback_mode(&cache->features) &&
- (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
-}
-
static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
bool *commit_needed)
{
int r, data_dir;
bool rb, background_queued;
dm_cblock_t cblock;
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
*commit_needed = false;
@@ -1816,6 +1774,8 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
}
if (r == -ENOENT) {
+ struct per_bio_data *pb = get_per_bio_data(bio);
+
/*
* Miss.
*/
@@ -1823,7 +1783,6 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
if (pb->req_nr == 0) {
accounted_begin(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
-
} else {
/*
* This is a duplicate writethrough io that is no
@@ -1842,18 +1801,17 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
* Passthrough always maps to the origin, invalidating any
* cache blocks that are written to.
*/
- if (passthrough_mode(&cache->features)) {
+ if (passthrough_mode(cache)) {
if (bio_data_dir(bio) == WRITE) {
bio_drop_shared_lock(cache, bio);
atomic_inc(&cache->stats.demotion);
invalidate_start(cache, cblock, block, bio);
} else
remap_to_origin_clear_discard(cache, bio, block);
-
} else {
- if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
!is_dirty(cache, cblock)) {
- remap_to_origin_then_cache(cache, bio, block, cblock);
+ remap_to_origin_and_cache(cache, bio, block, cblock);
accounted_begin(cache, bio);
} else
remap_to_cache_dirty(cache, bio, block, cblock);
@@ -1922,8 +1880,7 @@ static blk_status_t commit_op(void *context)
static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
if (!pb->req_nr)
remap_to_origin(cache, bio);
@@ -1983,28 +1940,6 @@ static void process_deferred_bios(struct work_struct *ws)
schedule_commit(&cache->committer);
}
-static void process_deferred_writethrough_bios(struct work_struct *ws)
-{
- struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
-
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
-
- bio_list_init(&bios);
-
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_writethrough_bios);
- bio_list_init(&cache->deferred_writethrough_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- /*
- * These bios have already been through accounted_begin()
- */
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
-}
-
/*----------------------------------------------------------------
* Main worker loop
*--------------------------------------------------------------*/
@@ -2112,6 +2047,9 @@ static void destroy(struct cache *cache)
kfree(cache->ctr_args[i]);
kfree(cache->ctr_args);
+ if (cache->bs)
+ bioset_free(cache->bs);
+
kfree(cache);
}
@@ -2555,8 +2493,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->discards_supported = true;
ti->split_discard_bios = false;
+ ti->per_io_data_size = sizeof(struct per_bio_data);
+
cache->features = ca->features;
- ti->per_io_data_size = get_per_bio_data_size(cache);
+ if (writethrough_mode(cache)) {
+ /* Create bioset for writethrough bios issued to origin */
+ cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!cache->bs)
+ goto bad;
+ }
cache->callbacks.congested_fn = cache_is_congested;
dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -2618,7 +2563,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- if (passthrough_mode(&cache->features)) {
+ if (passthrough_mode(cache)) {
bool all_clean;
r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
@@ -2637,9 +2582,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
- bio_list_init(&cache->deferred_writethrough_bios);
atomic_set(&cache->nr_allocated_migrations, 0);
atomic_set(&cache->nr_io_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
@@ -2678,8 +2621,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
- INIT_WORK(&cache->deferred_writethrough_worker,
- process_deferred_writethrough_bios);
INIT_WORK(&cache->migration_worker, check_migrations);
INIT_DELAYED_WORK(&cache->waker, do_waker);
@@ -2795,9 +2736,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
int r;
bool commit_needed;
dm_oblock_t block = get_bio_block(cache, bio);
- size_t pb_data_size = get_per_bio_data_size(cache);
- init_per_bio_data(bio, pb_data_size);
+ init_per_bio_data(bio);
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
* This can only occur if the io goes to a partial block at
@@ -2821,13 +2761,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return r;
}
-static int cache_end_io(struct dm_target *ti, struct bio *bio,
- blk_status_t *error)
+static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
struct cache *cache = ti->private;
unsigned long flags;
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ struct per_bio_data *pb = get_per_bio_data(bio);
if (pb->tick) {
policy_tick(cache->policy, false);
@@ -3243,13 +3181,13 @@ static void cache_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("1 ");
- if (writethrough_mode(&cache->features))
+ if (writethrough_mode(cache))
DMEMIT("writethrough ");
- else if (passthrough_mode(&cache->features))
+ else if (passthrough_mode(cache))
DMEMIT("passthrough ");
- else if (writeback_mode(&cache->features))
+ else if (writeback_mode(cache))
DMEMIT("writeback ");
else {
@@ -3415,7 +3353,7 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
unsigned i;
struct cblock_range range;
- if (!passthrough_mode(&cache->features)) {
+ if (!passthrough_mode(cache)) {
DMERR("%s: cache has to be in passthrough mode for invalidation",
cache_device_name(cache));
return -EPERM;
@@ -3534,18 +3472,18 @@ static int __init dm_cache_init(void)
{
int r;
- r = dm_register_target(&cache_target);
- if (r) {
- DMERR("cache target registration failed: %d", r);
- return r;
- }
-
migration_cache = KMEM_CACHE(dm_cache_migration, 0);
if (!migration_cache) {
dm_unregister_target(&cache_target);
return -ENOMEM;
}
+ r = dm_register_target(&cache_target);
+ if (r) {
+ DMERR("cache target registration failed: %d", r);
+ return r;
+ }
+
return 0;
}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 203144762f36..6a14f945783c 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -29,7 +29,6 @@ struct dm_kobject_holder {
* DM targets must _not_ deference a mapped_device to directly access its members!
*/
struct mapped_device {
- struct srcu_struct io_barrier;
struct mutex suspend_lock;
/*
@@ -127,6 +126,8 @@ struct mapped_device {
struct blk_mq_tag_set *tag_set;
bool use_blk_mq:1;
bool init_tio_pdu:1;
+
+ struct srcu_struct io_barrier;
};
void dm_init_md_queue(struct mapped_device *md);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 96ab46512e1f..554d60394c06 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1075,7 +1075,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
/* Reject unexpected unaligned bio. */
- if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ if (unlikely(bv_in.bv_len & (cc->sector_size - 1)))
return -EIO;
dmreq = dmreq_of_req(cc, req);
@@ -1168,7 +1168,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
int r = 0;
/* Reject unexpected unaligned bio. */
- if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ if (unlikely(bv_in.bv_len & (cc->sector_size - 1)))
return -EIO;
dmreq = dmreq_of_req(cc, req);
@@ -1954,10 +1954,15 @@ static int crypt_setkey(struct crypt_config *cc)
/* Ignore extra keys (which are used for IV etc) */
subkey_size = crypt_subkey_size(cc);
- if (crypt_integrity_hmac(cc))
+ if (crypt_integrity_hmac(cc)) {
+ if (subkey_size < cc->key_mac_size)
+ return -EINVAL;
+
crypt_copy_authenckey(cc->authenc_key, cc->key,
subkey_size - cc->key_mac_size,
cc->key_mac_size);
+ }
+
for (i = 0; i < cc->tfms_count; i++) {
if (crypt_integrity_hmac(cc))
r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
@@ -2053,9 +2058,6 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
ret = crypt_setkey(cc);
- /* wipe the kernel key payload copy in each case */
- memset(cc->key, 0, cc->key_size * sizeof(u8));
-
if (!ret) {
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
kzfree(cc->key_string);
@@ -2523,6 +2525,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
}
}
+ /* wipe the kernel key payload copy */
+ if (cc->key_string)
+ memset(cc->key, 0, cc->key_size * sizeof(u8));
+
return ret;
}
@@ -2740,6 +2746,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->tag_pool_max_sectors * cc->on_disk_tag_size);
if (!cc->tag_pool) {
ti->error = "Cannot allocate integrity tags mempool";
+ ret = -ENOMEM;
goto bad;
}
@@ -2961,6 +2968,9 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
return ret;
if (cc->iv_gen_ops && cc->iv_gen_ops->init)
ret = cc->iv_gen_ops->init(cc);
+ /* wipe the kernel key payload copy */
+ if (cc->key_string)
+ memset(cc->key, 0, cc->key_size * sizeof(u8));
return ret;
}
if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
@@ -3007,7 +3017,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 18, 0},
+ .version = {1, 18, 1},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2209a9700acd..288386bfbfb5 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -44,9 +44,9 @@ struct dm_delay_info {
static DEFINE_MUTEX(delayed_bios_lock);
-static void handle_delayed_timer(unsigned long data)
+static void handle_delayed_timer(struct timer_list *t)
{
- struct delay_c *dc = (struct delay_c *)data;
+ struct delay_c *dc = from_timer(dc, t, delay_timer);
queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}
@@ -195,7 +195,7 @@ out:
goto bad_queue;
}
- setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
+ timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index ba84b8d62cd0..73a5c198113a 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1513,7 +1513,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->flush_supported = true;
ti->num_discard_bios = 1;
- ti->discards_supported = true;
era->callbacks.congested_fn = era_is_congested;
dm_table_add_target_callbacks(ti->table, &era->callbacks);
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 096fe9b66c50..46d7c8749222 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -6,6 +6,7 @@
* This file is released under the GPL.
*/
+#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
@@ -80,13 +81,13 @@ struct journal_entry {
#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
#if BITS_PER_LONG == 64
-#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
+#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#elif defined(CONFIG_LBDAF)
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#else
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0)
#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
#endif
#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
@@ -320,7 +321,7 @@ static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, in
static int dm_integrity_failed(struct dm_integrity_c *ic)
{
- return ACCESS_ONCE(ic->failed);
+ return READ_ONCE(ic->failed);
}
static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
@@ -1093,9 +1094,9 @@ static void sleep_on_endio_wait(struct dm_integrity_c *ic)
__remove_wait_queue(&ic->endio_wait, &wait);
}
-static void autocommit_fn(unsigned long data)
+static void autocommit_fn(struct timer_list *t)
{
- struct dm_integrity_c *ic = (struct dm_integrity_c *)data;
+ struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
if (likely(!dm_integrity_failed(ic)))
queue_work(ic->commit_wq, &ic->commit_work);
@@ -1376,7 +1377,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
struct bvec_iter iter;
struct bio_vec bv;
bio_for_each_segment(bv, bio, iter) {
- if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
+ if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
bv.bv_offset, bv.bv_len, ic->sectors_per_block);
return DM_MAPIO_KILL;
@@ -1545,7 +1546,7 @@ retry_kmap:
smp_mb();
if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
wake_up(&ic->copy_to_journal_wait);
- if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
+ if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
queue_work(ic->commit_wq, &ic->commit_work);
} else {
schedule_autocommit(ic);
@@ -1798,7 +1799,7 @@ static void integrity_commit(struct work_struct *w)
ic->n_committed_sections += commit_sections;
spin_unlock_irq(&ic->endio_wait.lock);
- if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
+ if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
queue_work(ic->writer_wq, &ic->writer_work);
release_flush_bios:
@@ -1980,7 +1981,7 @@ static void integrity_writer(struct work_struct *w)
unsigned prev_free_sectors;
/* the following test is not needed, but it tests the replay code */
- if (ACCESS_ONCE(ic->suspending))
+ if (READ_ONCE(ic->suspending))
return;
spin_lock_irq(&ic->endio_wait.lock);
@@ -2558,7 +2559,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
int r = 0;
unsigned i;
__u64 journal_pages, journal_desc_size, journal_tree_size;
- unsigned char *crypt_data = NULL;
+ unsigned char *crypt_data = NULL, *crypt_iv = NULL;
+ struct skcipher_request *req = NULL;
ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
@@ -2616,9 +2618,20 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
if (blocksize == 1) {
struct scatterlist *sg;
- SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
- unsigned char iv[ivsize];
- skcipher_request_set_tfm(req, ic->journal_crypt);
+
+ req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
+ if (!req) {
+ *error = "Could not allocate crypt request";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ crypt_iv = kmalloc(ivsize, GFP_KERNEL);
+ if (!crypt_iv) {
+ *error = "Could not allocate iv";
+ r = -ENOMEM;
+ goto bad;
+ }
ic->journal_xor = dm_integrity_alloc_page_list(ic);
if (!ic->journal_xor) {
@@ -2640,9 +2653,9 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
sg_set_buf(&sg[i], va, PAGE_SIZE);
}
sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
- memset(iv, 0x00, ivsize);
+ memset(crypt_iv, 0x00, ivsize);
- skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv);
+ skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
@@ -2658,10 +2671,22 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
crypto_free_skcipher(ic->journal_crypt);
ic->journal_crypt = NULL;
} else {
- SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
- unsigned char iv[ivsize];
unsigned crypt_len = roundup(ivsize, blocksize);
+ req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
+ if (!req) {
+ *error = "Could not allocate crypt request";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ crypt_iv = kmalloc(ivsize, GFP_KERNEL);
+ if (!crypt_iv) {
+ *error = "Could not allocate iv";
+ r = -ENOMEM;
+ goto bad;
+ }
+
crypt_data = kmalloc(crypt_len, GFP_KERNEL);
if (!crypt_data) {
*error = "Unable to allocate crypt data";
@@ -2669,8 +2694,6 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
goto bad;
}
- skcipher_request_set_tfm(req, ic->journal_crypt);
-
ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
if (!ic->journal_scatterlist) {
*error = "Unable to allocate sg list";
@@ -2694,12 +2717,12 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
struct skcipher_request *section_req;
__u32 section_le = cpu_to_le32(i);
- memset(iv, 0x00, ivsize);
+ memset(crypt_iv, 0x00, ivsize);
memset(crypt_data, 0x00, crypt_len);
memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
sg_init_one(&sg, crypt_data, crypt_len);
- skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv);
+ skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
@@ -2757,6 +2780,9 @@ retest_commit_id:
}
bad:
kfree(crypt_data);
+ kfree(crypt_iv);
+ skcipher_request_free(req);
+
return r;
}
@@ -2941,7 +2967,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
ic->autocommit_msec = sync_msec;
- setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic);
+ timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
ic->io = dm_io_client_create();
if (IS_ERR(ic->io)) {
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index cf2c67e35eaf..eb45cc3df31d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t)
try_again:
spin_lock_irq(&throttle_spinlock);
- throttle = ACCESS_ONCE(t->throttle);
+ throttle = READ_ONCE(t->throttle);
if (likely(throttle >= 100))
goto skip_limit;
@@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t)
t->num_io_jobs--;
- if (likely(ACCESS_ONCE(t->throttle) >= 100))
+ if (likely(READ_ONCE(t->throttle) >= 100))
goto skip_limit;
if (!t->num_io_jobs) {
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 8b80a9ce9ea9..189badbeddaf 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -10,9 +10,11 @@
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/uio.h>
#define DM_MSG_PREFIX "log-writes"
@@ -246,27 +248,108 @@ error:
return -1;
}
+static int write_inline_data(struct log_writes_c *lc, void *entry,
+ size_t entrylen, void *data, size_t datalen,
+ sector_t sector)
+{
+ int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
+ struct page *page;
+ struct bio *bio;
+ size_t ret;
+ void *ptr;
+
+ while (datalen) {
+ num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
+ bio_pages = min(num_pages, BIO_MAX_PAGES);
+
+ atomic_inc(&lc->io_blocks);
+
+ bio = bio_alloc(GFP_KERNEL, bio_pages);
+ if (!bio) {
+ DMERR("Couldn't alloc inline data bio");
+ goto error;
+ }
+
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = sector;
+ bio_set_dev(bio, lc->logdev->bdev);
+ bio->bi_end_io = log_end_io;
+ bio->bi_private = lc;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ for (i = 0; i < bio_pages; i++) {
+ pg_datalen = min_t(int, datalen, PAGE_SIZE);
+ pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ DMERR("Couldn't alloc inline data page");
+ goto error_bio;
+ }
+
+ ptr = kmap_atomic(page);
+ memcpy(ptr, data, pg_datalen);
+ if (pg_sectorlen > pg_datalen)
+ memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
+ kunmap_atomic(ptr);
+
+ ret = bio_add_page(bio, page, pg_sectorlen, 0);
+ if (ret != pg_sectorlen) {
+ DMERR("Couldn't add page of inline data");
+ __free_page(page);
+ goto error_bio;
+ }
+
+ datalen -= pg_datalen;
+ data += pg_datalen;
+ }
+ submit_bio(bio);
+
+ sector += bio_pages * PAGE_SECTORS;
+ }
+ return 0;
+error_bio:
+ bio_free_pages(bio);
+ bio_put(bio);
+error:
+ put_io_block(lc);
+ return -1;
+}
+
static int log_one_block(struct log_writes_c *lc,
struct pending_block *block, sector_t sector)
{
struct bio *bio;
struct log_write_entry entry;
- size_t ret;
+ size_t metadatalen, ret;
int i;
entry.sector = cpu_to_le64(block->sector);
entry.nr_sectors = cpu_to_le64(block->nr_sectors);
entry.flags = cpu_to_le64(block->flags);
entry.data_len = cpu_to_le64(block->datalen);
+
+ metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
if (write_metadata(lc, &entry, sizeof(entry), block->data,
- block->datalen, sector)) {
+ metadatalen, sector)) {
free_pending_block(lc, block);
return -1;
}
+ sector += dev_to_bio_sectors(lc, 1);
+
+ if (block->datalen && metadatalen == 0) {
+ if (write_inline_data(lc, &entry, sizeof(entry), block->data,
+ block->datalen, sector)) {
+ free_pending_block(lc, block);
+ return -1;
+ }
+ /* we don't support both inline data & bio data */
+ goto out;
+ }
+
if (!block->vec_cnt)
goto out;
- sector += dev_to_bio_sectors(lc, 1);
atomic_inc(&lc->io_blocks);
bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
@@ -527,6 +610,51 @@ static int log_mark(struct log_writes_c *lc, char *data)
return 0;
}
+static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
+ struct iov_iter *i)
+{
+ struct pending_block *block;
+
+ if (!bytes)
+ return 0;
+
+ block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+ if (!block) {
+ DMERR("Error allocating dax pending block");
+ return -ENOMEM;
+ }
+
+ block->data = kzalloc(bytes, GFP_KERNEL);
+ if (!block->data) {
+ DMERR("Error allocating dax data space");
+ kfree(block);
+ return -ENOMEM;
+ }
+
+ /* write data provided via the iterator */
+ if (!copy_from_iter(block->data, bytes, i)) {
+ DMERR("Error copying dax data");
+ kfree(block->data);
+ kfree(block);
+ return -EIO;
+ }
+
+ /* rewind the iterator so that the block driver can use it */
+ iov_iter_revert(i, bytes);
+
+ block->datalen = bytes;
+ block->sector = bio_to_dev_sectors(lc, sector);
+ block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
+
+ atomic_inc(&lc->pending_blocks);
+ spin_lock_irq(&lc->blocks_lock);
+ list_add_tail(&block->list, &lc->unflushed_blocks);
+ spin_unlock_irq(&lc->blocks_lock);
+ wake_up_process(lc->log_kthread);
+
+ return 0;
+}
+
static void log_writes_dtr(struct dm_target *ti)
{
struct log_writes_c *lc = ti->private;
@@ -792,9 +920,46 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
limits->io_min = limits->physical_block_size;
}
+static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct log_writes_c *lc = ti->private;
+ sector_t sector = pgoff * PAGE_SECTORS;
+ int ret;
+
+ ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
+ if (ret)
+ return ret;
+ return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
+}
+
+static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
+ pgoff_t pgoff, void *addr, size_t bytes,
+ struct iov_iter *i)
+{
+ struct log_writes_c *lc = ti->private;
+ sector_t sector = pgoff * PAGE_SECTORS;
+ int err;
+
+ if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+
+ /* Don't bother doing anything if logging has been disabled */
+ if (!lc->logging_enabled)
+ goto dax_copy;
+
+ err = log_dax(lc, sector, bytes, i);
+ if (err) {
+ DMWARN("Error %d logging DAX write", err);
+ return 0;
+ }
+dax_copy:
+ return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
+
static struct target_type log_writes_target = {
.name = "log-writes",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = log_writes_ctr,
.dtr = log_writes_dtr,
@@ -805,6 +970,8 @@ static struct target_type log_writes_target = {
.message = log_writes_message,
.iterate_devices = log_writes_iterate_devices,
.io_hints = log_writes_io_hints,
+ .direct_access = log_writes_dax_direct_access,
+ .dax_copy_from_iter = log_writes_dax_copy_from_iter,
};
static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 11f273d2f018..f7810cc869ac 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m,
pgpath = path_to_pgpath(path);
- if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+ if (unlikely(READ_ONCE(m->current_pg) != pg)) {
/* Only update current_pgpath if pg changed */
spin_lock_irqsave(&m->lock, flags);
m->current_pgpath = pgpath;
@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
}
/* Were we instructed to switch PG? */
- if (lockless_dereference(m->next_pg)) {
+ if (READ_ONCE(m->next_pg)) {
spin_lock_irqsave(&m->lock, flags);
pg = m->next_pg;
if (!pg) {
@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
/* Don't change PG until it has no remaining paths */
check_current_pg:
- pg = lockless_dereference(m->current_pg);
+ pg = READ_ONCE(m->current_pg);
if (pg) {
pgpath = choose_path_in_pg(m, pg, nr_bytes);
if (!IS_ERR_OR_NULL(pgpath))
@@ -458,6 +458,38 @@ do { \
} while (0)
/*
+ * Check whether bios must be queued in the device-mapper core rather
+ * than here in the target.
+ *
+ * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold
+ * the same value then we are not between multipath_presuspend()
+ * and multipath_resume() calls and we have no need to check
+ * for the DMF_NOFLUSH_SUSPENDING flag.
+ */
+static bool __must_push_back(struct multipath *m, unsigned long flags)
+{
+ return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) !=
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) &&
+ dm_noflush_suspending(m->ti));
+}
+
+/*
+ * Following functions use READ_ONCE to get atomic access to
+ * all m->flags to avoid taking spinlock
+ */
+static bool must_push_back_rq(struct multipath *m)
+{
+ unsigned long flags = READ_ONCE(m->flags);
+ return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags);
+}
+
+static bool must_push_back_bio(struct multipath *m)
+{
+ unsigned long flags = READ_ONCE(m->flags);
+ return __must_push_back(m, flags);
+}
+
+/*
* Map cloned requests (request-based multipath)
*/
static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
@@ -473,12 +505,12 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
struct request *clone;
/* Do we need to select a new pgpath? */
- pgpath = lockless_dereference(m->current_pgpath);
+ pgpath = READ_ONCE(m->current_pgpath);
if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ if (must_push_back_rq(m))
return DM_MAPIO_DELAY_REQUEUE;
dm_report_EIO(m); /* Failed */
return DM_MAPIO_KILL;
@@ -499,8 +531,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
bool queue_dying = blk_queue_dying(q);
- DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
- PTR_ERR(clone), queue_dying ? " (path offline)" : "");
if (queue_dying) {
atomic_inc(&m->pg_init_in_progress);
activate_or_offline_path(pgpath);
@@ -535,7 +565,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
bool queue_io;
/* Do we need to select a new pgpath? */
- pgpath = lockless_dereference(m->current_pgpath);
+ pgpath = READ_ONCE(m->current_pgpath);
queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
if (!pgpath || !queue_io)
pgpath = choose_pgpath(m, nr_bytes);
@@ -555,7 +585,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
}
if (!pgpath) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ if (must_push_back_bio(m))
return DM_MAPIO_REQUEUE;
dm_report_EIO(m);
return DM_MAPIO_KILL;
@@ -641,14 +671,6 @@ static void process_queued_bios(struct work_struct *work)
blk_finish_plug(&plug);
}
-static void assign_bit(bool value, long nr, unsigned long *addr)
-{
- if (value)
- set_bit(nr, addr);
- else
- clear_bit(nr, addr);
-}
-
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@@ -658,11 +680,10 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
- (!save_old_value && queue_if_no_path),
- MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
- MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags,
+ (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
+ (!save_old_value && queue_if_no_path));
+ assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
spin_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path) {
@@ -1496,7 +1517,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
fail_path(pgpath);
if (atomic_read(&m->nr_valid_paths) == 0 &&
- !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ !must_push_back_rq(m)) {
if (error == BLK_STS_IOERR)
dm_report_EIO(m);
/* complete with the original error */
@@ -1531,8 +1552,12 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
if (atomic_read(&m->nr_valid_paths) == 0 &&
!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- dm_report_EIO(m);
- *error = BLK_STS_IOERR;
+ if (must_push_back_bio(m)) {
+ r = DM_ENDIO_REQUEUE;
+ } else {
+ dm_report_EIO(m);
+ *error = BLK_STS_IOERR;
+ }
goto done;
}
@@ -1588,8 +1613,8 @@ static void multipath_resume(struct dm_target *ti)
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
- MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags,
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
spin_unlock_irqrestore(&m->lock, flags);
}
@@ -1804,7 +1829,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
struct pgpath *current_pgpath;
int r;
- current_pgpath = lockless_dereference(m->current_pgpath);
+ current_pgpath = READ_ONCE(m->current_pgpath);
if (!current_pgpath)
current_pgpath = choose_pgpath(m, 0);
@@ -1826,7 +1851,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
}
if (r == -ENOTCONN) {
- if (!lockless_dereference(m->current_pg)) {
+ if (!READ_ONCE(m->current_pg)) {
/* Path status changed, redo selection */
(void) choose_pgpath(m, 0);
}
@@ -1895,9 +1920,9 @@ static int multipath_busy(struct dm_target *ti)
return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
/* Guess which priority_group will be used at next mapping time */
- pg = lockless_dereference(m->current_pg);
- next_pg = lockless_dereference(m->next_pg);
- if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+ pg = READ_ONCE(m->current_pg);
+ next_pg = READ_ONCE(m->next_pg);
+ if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
pg = next_pg;
if (!pg) {
@@ -1967,13 +1992,6 @@ static int __init dm_multipath_init(void)
{
int r;
- r = dm_register_target(&multipath_target);
- if (r < 0) {
- DMERR("request-based register failed %d", r);
- r = -EINVAL;
- goto bad_register_target;
- }
-
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
@@ -1995,13 +2013,20 @@ static int __init dm_multipath_init(void)
goto bad_alloc_kmpath_handlerd;
}
+ r = dm_register_target(&multipath_target);
+ if (r < 0) {
+ DMERR("request-based register failed %d", r);
+ r = -EINVAL;
+ goto bad_register_target;
+ }
+
return 0;
+bad_register_target:
+ destroy_workqueue(kmpath_handlerd);
bad_alloc_kmpath_handlerd:
destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
- dm_unregister_target(&multipath_target);
-bad_register_target:
return r;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2245d06d2045..6319d846e0ad 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -12,7 +12,7 @@
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#include <linux/device-mapper.h>
@@ -2143,13 +2143,6 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
struct dm_raid_superblock *refsb;
uint64_t events_sb, events_refsb;
- rdev->sb_start = 0;
- rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
- if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
- DMERR("superblock size of a logical block is no longer valid");
- return -EINVAL;
- }
-
r = read_disk_sb(rdev, rdev->sb_size, false);
if (r)
return r;
@@ -2494,6 +2487,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (test_bit(Journal, &rdev->flags))
continue;
+ if (!rdev->meta_bdev)
+ continue;
+
+ /* Set superblock offset/size for metadata device. */
+ rdev->sb_start = 0;
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(struct dm_raid_superblock) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
+
/*
* Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
@@ -2506,9 +2510,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
continue;
- if (!rdev->meta_bdev)
- continue;
-
r = super_load(rdev, freshest);
switch (r) {
@@ -2886,9 +2887,6 @@ static void configure_discard_support(struct raid_set *rs)
bool raid456;
struct dm_target *ti = rs->ti;
- /* Assume discards not supported until after checks below. */
- ti->discards_supported = false;
-
/*
* XXX: RAID level 4,5,6 require zeroing for safety.
*/
@@ -2913,9 +2911,6 @@ static void configure_discard_support(struct raid_set *rs)
}
}
- /* All RAID members properly support discards */
- ti->discards_supported = true;
-
/*
* RAID1 and RAID10 personalities require bio splitting,
* RAID0/4/5/6 don't and process large discard bios properly.
@@ -3629,8 +3624,11 @@ static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
+ if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+ mddev_lock_nointr(&rs->md);
mddev_suspend(&rs->md);
+ mddev_unlock(&rs->md);
+ }
rs->md.ro = 1;
}
@@ -3887,8 +3885,11 @@ static void raid_resume(struct dm_target *ti)
if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
+ if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+ mddev_lock_nointr(mddev);
mddev_resume(mddev);
+ mddev_unlock(mddev);
+ }
}
static struct target_type raid_target = {
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index c0b82136b2d1..580c49cc8079 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -94,9 +94,9 @@ static void wakeup_mirrord(void *context)
queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
}
-static void delayed_wake_fn(unsigned long data)
+static void delayed_wake_fn(struct timer_list *t)
{
- struct mirror_set *ms = (struct mirror_set *) data;
+ struct mirror_set *ms = from_timer(ms, t, timer);
clear_bit(0, &ms->timer_pending);
wakeup_mirrord(ms);
@@ -108,8 +108,6 @@ static void delayed_wake(struct mirror_set *ms)
return;
ms->timer.expires = jiffies + HZ / 5;
- ms->timer.data = (unsigned long) ms;
- ms->timer.function = delayed_wake_fn;
add_timer(&ms->timer);
}
@@ -1133,7 +1131,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto err_free_context;
}
INIT_WORK(&ms->kmirrord_work, do_mirror);
- init_timer(&ms->timer);
+ timer_setup(&ms->timer, delayed_wake_fn, 0);
ms->timer_pending = 0;
INIT_WORK(&ms->trigger_event, trigger_event);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index eadfcfd106ff..9d32f25489c2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
int dm_request_based(struct mapped_device *md)
{
- return blk_queue_stackable(md->queue);
+ return queue_is_rq_based(md->queue);
}
static void dm_old_start_queue(struct request_queue *q)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1113b42e1eda..a0613bd8ed00 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2411,24 +2411,6 @@ static int __init dm_snapshot_init(void)
return r;
}
- r = dm_register_target(&snapshot_target);
- if (r < 0) {
- DMERR("snapshot target register failed %d", r);
- goto bad_register_snapshot_target;
- }
-
- r = dm_register_target(&origin_target);
- if (r < 0) {
- DMERR("Origin target register failed %d", r);
- goto bad_register_origin_target;
- }
-
- r = dm_register_target(&merge_target);
- if (r < 0) {
- DMERR("Merge target register failed %d", r);
- goto bad_register_merge_target;
- }
-
r = init_origin_hash();
if (r) {
DMERR("init_origin_hash failed.");
@@ -2449,19 +2431,37 @@ static int __init dm_snapshot_init(void)
goto bad_pending_cache;
}
+ r = dm_register_target(&snapshot_target);
+ if (r < 0) {
+ DMERR("snapshot target register failed %d", r);
+ goto bad_register_snapshot_target;
+ }
+
+ r = dm_register_target(&origin_target);
+ if (r < 0) {
+ DMERR("Origin target register failed %d", r);
+ goto bad_register_origin_target;
+ }
+
+ r = dm_register_target(&merge_target);
+ if (r < 0) {
+ DMERR("Merge target register failed %d", r);
+ goto bad_register_merge_target;
+ }
+
return 0;
-bad_pending_cache:
- kmem_cache_destroy(exception_cache);
-bad_exception_cache:
- exit_origin_hash();
-bad_origin_hash:
- dm_unregister_target(&merge_target);
bad_register_merge_target:
dm_unregister_target(&origin_target);
bad_register_origin_target:
dm_unregister_target(&snapshot_target);
bad_register_snapshot_target:
+ kmem_cache_destroy(pending_cache);
+bad_pending_cache:
+ kmem_cache_destroy(exception_cache);
+bad_exception_cache:
+ exit_origin_hash();
+bad_origin_hash:
dm_exception_store_exit();
return r;
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index a7868503d135..29bc51084c82 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -432,7 +432,7 @@ do_sync_free:
synchronize_rcu_expedited();
dm_stat_free(&s->rcu_head);
} else {
- ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
+ WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
call_rcu(&s->rcu_head, dm_stat_free);
}
return 0;
@@ -640,12 +640,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
*/
last = raw_cpu_ptr(stats->last);
stats_aux->merged =
- (bi_sector == (ACCESS_ONCE(last->last_sector) &&
+ (bi_sector == (READ_ONCE(last->last_sector) &&
((bi_rw == WRITE) ==
- (ACCESS_ONCE(last->last_rw) == WRITE))
+ (READ_ONCE(last->last_rw) == WRITE))
));
- ACCESS_ONCE(last->last_sector) = end_sector;
- ACCESS_ONCE(last->last_rw) = bi_rw;
+ WRITE_ONCE(last->last_sector, end_sector);
+ WRITE_ONCE(last->last_rw, bi_rw);
}
rcu_read_lock();
@@ -694,22 +694,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
for_each_possible_cpu(cpu) {
p = &s->stat_percpu[cpu][x];
- shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
- shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
- shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
- shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
- shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
- shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
- shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
- shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
- shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
- shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
- shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
- shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+ shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
+ shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
+ shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
+ shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
+ shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
+ shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
+ shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
+ shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
+ shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
+ shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
+ shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
+ shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
if (s->n_histogram_entries) {
unsigned i;
for (i = 0; i < s->n_histogram_entries + 1; i++)
- shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+ shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
}
}
}
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 4c8de1ff78ca..8d0ba879777e 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long
switch_get_position(sctx, region_nr, &region_index, &bit);
- return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+ return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
((1 << sctx->region_table_entry_bits) - 1);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ef7b8f201f73..aaffd0c0ee9a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -451,16 +451,17 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
return r;
}
- atomic_set(&dd->count, 0);
+ refcount_set(&dd->count, 1);
list_add(&dd->list, &t->devices);
+ goto out;
} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
r = upgrade_mode(dd, mode, t->md);
if (r)
return r;
}
- atomic_inc(&dd->count);
-
+ refcount_inc(&dd->count);
+out:
*result = dd->dm_dev;
return 0;
}
@@ -515,7 +516,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
dm_device_name(ti->table->md), d->name);
return;
}
- if (atomic_dec_and_test(&dd->count)) {
+ if (refcount_dec_and_test(&dd->count)) {
dm_put_table_device(ti->table->md, d);
list_del(&dd->list);
kfree(dd);
@@ -1000,7 +1001,7 @@ verify_rq_based:
list_for_each_entry(dd, devices, list) {
struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
- if (!blk_queue_stackable(q)) {
+ if (!queue_is_rq_based(q)) {
DMERR("table load rejected: including"
" non-request-stackable devices");
return -EINVAL;
@@ -1758,13 +1759,12 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
return true;
}
-
-static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
- return q && blk_queue_discard(q);
+ return q && !blk_queue_discard(q);
}
static bool dm_table_supports_discards(struct dm_table *t)
@@ -1772,28 +1772,24 @@ static bool dm_table_supports_discards(struct dm_table *t)
struct dm_target *ti;
unsigned i;
- /*
- * Unless any target used by the table set discards_supported,
- * require at least one underlying device to support discards.
- * t->devices includes internal dm devices such as mirror logs
- * so we need to use iterate_devices here, which targets
- * supporting discard selectively must provide.
- */
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->num_discard_bios)
- continue;
-
- if (ti->discards_supported)
- return true;
+ return false;
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti, device_discard_capable, NULL))
- return true;
+ /*
+ * Either the target provides discard support (as implied by setting
+ * 'discards_supported') or it relies on _all_ data devices having
+ * discard support.
+ */
+ if (!ti->discards_supported &&
+ (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_discard_capable, NULL)))
+ return false;
}
- return false;
+ return true;
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1806,9 +1802,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
*/
q->limits = *limits;
- if (!dm_table_supports_discards(t))
+ if (!dm_table_supports_discards(t)) {
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
- else
+ /* Must also clear discard limits... */
+ q->limits.max_discard_sectors = 0;
+ q->limits.max_hw_discard_sectors = 0;
+ q->limits.discard_granularity = 0;
+ q->limits.discard_alignment = 0;
+ q->limits.discard_misaligned = 0;
+ } else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
@@ -1847,19 +1849,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
*/
if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
-
- /*
- * QUEUE_FLAG_STACKABLE must be set after all queue settings are
- * visible to other CPUs because, once the flag is set, incoming bios
- * are processed by request-based dm, which refers to the queue
- * settings.
- * Until the flag set, bios are passed to bio-based dm and queued to
- * md->deferred where queue settings are not needed yet.
- * Those bios are passed to request-based dm at the resume time.
- */
- smp_mb();
- if (dm_table_request_based(t))
- queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index d31d18d9727c..36ef284ad086 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -80,10 +80,14 @@
#define SECTOR_TO_BLOCK_SHIFT 3
/*
+ * For btree insert:
* 3 for btree insert +
* 2 for btree lookup used within space map
+ * For btree remove:
+ * 2 for shadow spine +
+ * 4 for rebalance 3 child node
*/
-#define THIN_MAX_CONCURRENT_LOCKS 5
+#define THIN_MAX_CONCURRENT_LOCKS 6
/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1e25705209c2..f91d771fff4b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
struct pool_c *pt = pool->ti->private;
bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
enum pool_mode old_mode = get_pool_mode(pool);
- unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
+ unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
/*
* Never allow the pool to transition to PM_WRITE mode if user
@@ -4355,30 +4355,28 @@ static struct target_type thin_target = {
static int __init dm_thin_init(void)
{
- int r;
+ int r = -ENOMEM;
pool_table_init();
+ _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
+ if (!_new_mapping_cache)
+ return r;
+
r = dm_register_target(&thin_target);
if (r)
- return r;
+ goto bad_new_mapping_cache;
r = dm_register_target(&pool_target);
if (r)
- goto bad_pool_target;
-
- r = -ENOMEM;
-
- _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
- if (!_new_mapping_cache)
- goto bad_new_mapping_cache;
+ goto bad_thin_target;
return 0;
-bad_new_mapping_cache:
- dm_unregister_target(&pool_target);
-bad_pool_target:
+bad_thin_target:
dm_unregister_target(&thin_target);
+bad_new_mapping_cache:
+ kmem_cache_destroy(_new_mapping_cache);
return r;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index bda3caca23ca..aedb8222836b 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -92,74 +92,33 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
return block >> (level * v->hash_per_block_bits);
}
-/*
- * Callback function for asynchrnous crypto API completion notification
- */
-static void verity_op_done(struct crypto_async_request *base, int err)
-{
- struct verity_result *res = (struct verity_result *)base->data;
-
- if (err == -EINPROGRESS)
- return;
-
- res->err = err;
- complete(&res->completion);
-}
-
-/*
- * Wait for async crypto API callback
- */
-static inline int verity_complete_op(struct verity_result *res, int ret)
-{
- switch (ret) {
- case 0:
- break;
-
- case -EINPROGRESS:
- case -EBUSY:
- ret = wait_for_completion_interruptible(&res->completion);
- if (!ret)
- ret = res->err;
- reinit_completion(&res->completion);
- break;
-
- default:
- DMERR("verity_wait_hash: crypto op submission failed: %d", ret);
- }
-
- if (unlikely(ret < 0))
- DMERR("verity_wait_hash: crypto op failed: %d", ret);
-
- return ret;
-}
-
static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len,
- struct verity_result *res)
+ struct crypto_wait *wait)
{
struct scatterlist sg;
sg_init_one(&sg, data, len);
ahash_request_set_crypt(req, &sg, NULL, len);
- return verity_complete_op(res, crypto_ahash_update(req));
+ return crypto_wait_req(crypto_ahash_update(req), wait);
}
/*
* Wrapper for crypto_ahash_init, which handles verity salting.
*/
static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
- struct verity_result *res)
+ struct crypto_wait *wait)
{
int r;
ahash_request_set_tfm(req, v->tfm);
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
CRYPTO_TFM_REQ_MAY_BACKLOG,
- verity_op_done, (void *)res);
- init_completion(&res->completion);
+ crypto_req_done, (void *)wait);
+ crypto_init_wait(wait);
- r = verity_complete_op(res, crypto_ahash_init(req));
+ r = crypto_wait_req(crypto_ahash_init(req), wait);
if (unlikely(r < 0)) {
DMERR("crypto_ahash_init failed: %d", r);
@@ -167,18 +126,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
}
if (likely(v->salt_size && (v->version >= 1)))
- r = verity_hash_update(v, req, v->salt, v->salt_size, res);
+ r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
return r;
}
static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
- u8 *digest, struct verity_result *res)
+ u8 *digest, struct crypto_wait *wait)
{
int r;
if (unlikely(v->salt_size && (!v->version))) {
- r = verity_hash_update(v, req, v->salt, v->salt_size, res);
+ r = verity_hash_update(v, req, v->salt, v->salt_size, wait);
if (r < 0) {
DMERR("verity_hash_final failed updating salt: %d", r);
@@ -187,7 +146,7 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
}
ahash_request_set_crypt(req, NULL, digest, 0);
- r = verity_complete_op(res, crypto_ahash_final(req));
+ r = crypto_wait_req(crypto_ahash_final(req), wait);
out:
return r;
}
@@ -196,17 +155,17 @@ int verity_hash(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len, u8 *digest)
{
int r;
- struct verity_result res;
+ struct crypto_wait wait;
- r = verity_hash_init(v, req, &res);
+ r = verity_hash_init(v, req, &wait);
if (unlikely(r < 0))
goto out;
- r = verity_hash_update(v, req, data, len, &res);
+ r = verity_hash_update(v, req, data, len, &wait);
if (unlikely(r < 0))
goto out;
- r = verity_hash_final(v, req, digest, &res);
+ r = verity_hash_final(v, req, digest, &wait);
out:
return r;
@@ -389,7 +348,7 @@ out:
* Calculates the digest for the given bio
*/
int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
- struct bvec_iter *iter, struct verity_result *res)
+ struct bvec_iter *iter, struct crypto_wait *wait)
{
unsigned int todo = 1 << v->data_dev_block_bits;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
@@ -414,7 +373,7 @@ int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
*/
sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
ahash_request_set_crypt(req, &sg, NULL, len);
- r = verity_complete_op(res, crypto_ahash_update(req));
+ r = crypto_wait_req(crypto_ahash_update(req), wait);
if (unlikely(r < 0)) {
DMERR("verity_for_io_block crypto op failed: %d", r);
@@ -482,7 +441,7 @@ static int verity_verify_io(struct dm_verity_io *io)
struct dm_verity *v = io->v;
struct bvec_iter start;
unsigned b;
- struct verity_result res;
+ struct crypto_wait wait;
for (b = 0; b < io->n_blocks; b++) {
int r;
@@ -507,17 +466,17 @@ static int verity_verify_io(struct dm_verity_io *io)
continue;
}
- r = verity_hash_init(v, req, &res);
+ r = verity_hash_init(v, req, &wait);
if (unlikely(r < 0))
return r;
start = io->iter;
- r = verity_for_io_block(v, io, &io->iter, &res);
+ r = verity_for_io_block(v, io, &io->iter, &wait);
if (unlikely(r < 0))
return r;
r = verity_hash_final(v, req, verity_io_real_digest(v, io),
- &res);
+ &wait);
if (unlikely(r < 0))
return r;
@@ -589,7 +548,7 @@ static void verity_prefetch_io(struct work_struct *work)
verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
if (!i) {
- unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
+ unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster);
cluster >>= v->data_dev_block_bits;
if (unlikely(!cluster))
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index a59e0ada6fd3..b675bc015512 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -90,11 +90,6 @@ struct dm_verity_io {
*/
};
-struct verity_result {
- struct completion completion;
- int err;
-};
-
static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
struct dm_verity_io *io)
{
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index b87c1741da4b..6d7bda6f8190 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -660,6 +660,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
struct dmz_target *dmz = ti->private;
struct request_queue *q;
struct dmz_dev *dev;
+ sector_t aligned_capacity;
int ret;
/* Get the target device */
@@ -685,15 +686,17 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
goto err;
}
+ q = bdev_get_queue(dev->bdev);
dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
- if (ti->begin || (ti->len != dev->capacity)) {
+ aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+ if (ti->begin ||
+ ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
ti->error = "Partial mapping not supported";
ret = -EINVAL;
goto err;
}
- q = bdev_get_queue(dev->bdev);
- dev->zone_nr_sectors = q->limits.chunk_sectors;
+ dev->zone_nr_sectors = blk_queue_zone_sectors(q);
dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
@@ -929,8 +932,10 @@ static int dmz_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dmz_target *dmz = ti->private;
+ struct dmz_dev *dev = dmz->dev;
+ sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);
- return fn(ti, dmz->ddev, 0, dmz->dev->capacity, data);
+ return fn(ti, dmz->ddev, 0, capacity, data);
}
static struct target_type dmz_type = {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4be85324f44d..de17b7193299 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,7 @@
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
+#include <linux/refcount.h>
#define DM_MSG_PREFIX "core"
@@ -98,7 +99,7 @@ struct dm_md_mempools {
struct table_device {
struct list_head list;
- atomic_t count;
+ refcount_t count;
struct dm_dev dm_dev;
};
@@ -114,7 +115,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
static int __dm_get_module_param_int(int *module_param, int min, int max)
{
- int param = ACCESS_ONCE(*module_param);
+ int param = READ_ONCE(*module_param);
int modified_param = 0;
bool modified = true;
@@ -136,7 +137,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max)
unsigned __dm_get_module_param(unsigned *module_param,
unsigned def, unsigned max)
{
- unsigned param = ACCESS_ONCE(*module_param);
+ unsigned param = READ_ONCE(*module_param);
unsigned modified_param = 0;
if (!param)
@@ -685,10 +686,11 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
format_dev_t(td->dm_dev.name, dev);
- atomic_set(&td->count, 0);
+ refcount_set(&td->count, 1);
list_add(&td->list, &md->table_devices);
+ } else {
+ refcount_inc(&td->count);
}
- atomic_inc(&td->count);
mutex_unlock(&md->table_devices_lock);
*result = &td->dm_dev;
@@ -701,7 +703,7 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
struct table_device *td = container_of(d, struct table_device, dm_dev);
mutex_lock(&md->table_devices_lock);
- if (atomic_dec_and_test(&td->count)) {
+ if (refcount_dec_and_test(&td->count)) {
close_table_device(td, md);
list_del(&td->list);
kfree(td);
@@ -718,7 +720,7 @@ static void free_table_devices(struct list_head *devices)
struct table_device *td = list_entry(tmp, struct table_device, list);
DMWARN("dm_destroy: %s still exists with %d references",
- td->dm_dev.name, atomic_read(&td->count));
+ td->dm_dev.name, refcount_read(&td->count));
kfree(td);
}
}
@@ -1619,17 +1621,6 @@ static void dm_wq_work(struct work_struct *work);
void dm_init_md_queue(struct mapped_device *md)
{
/*
- * Request-based dm devices cannot be stacked on top of bio-based dm
- * devices. The type of this dm device may not have been decided yet.
- * The type is decided at the first table loading time.
- * To prevent problematic device stacking, clear the queue flag
- * for request stacking support until then.
- *
- * This queue is new, so no concurrency on the queue_flags.
- */
- queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
-
- /*
* Initialize data that will only be used by a non-blk-mq DM queue
* - must do so here (in alloc_dev callchain) before queue is used
*/
@@ -1695,7 +1686,7 @@ static struct mapped_device *alloc_dev(int minor)
struct mapped_device *md;
void *old_md;
- md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
+ md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
@@ -1795,7 +1786,7 @@ bad_io_barrier:
bad_minor:
module_put(THIS_MODULE);
bad_module_get:
- kfree(md);
+ kvfree(md);
return NULL;
}
@@ -1814,7 +1805,7 @@ static void free_dev(struct mapped_device *md)
free_minor(minor);
module_put(THIS_MODULE);
- kfree(md);
+ kvfree(md);
}
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
@@ -2072,17 +2063,12 @@ struct mapped_device *dm_get_md(dev_t dev)
spin_lock(&_minor_lock);
md = idr_find(&_minor_idr, minor);
- if (md) {
- if ((md == MINOR_ALLOCED ||
- (MINOR(disk_devt(dm_disk(md))) != minor) ||
- dm_deleting_md(md) ||
- test_bit(DMF_FREEING, &md->flags))) {
- md = NULL;
- goto out;
- }
- dm_get(md);
+ if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
+ test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
+ md = NULL;
+ goto out;
}
-
+ dm_get(md);
out:
spin_unlock(&_minor_lock);
@@ -2709,11 +2695,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
- if (test_bit(DMF_FREEING, &md->flags) ||
- dm_deleting_md(md))
- return NULL;
-
+ spin_lock(&_minor_lock);
+ if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
+ md = NULL;
+ goto out;
+ }
dm_get(md);
+out:
+ spin_unlock(&_minor_lock);
+
return md;
}
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 38c84c0a35d4..36399bb875dd 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -19,6 +19,7 @@
#include <linux/hdreg.h>
#include <linux/completion.h>
#include <linux/kobject.h>
+#include <linux/refcount.h>
#include "dm-stats.h"
@@ -38,7 +39,7 @@
*/
struct dm_dev_internal {
struct list_head list;
- atomic_t count;
+ refcount_t count;
struct dm_dev *dm_dev;
};
diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c
index d2121637b4ab..239c7bb3929b 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -29,7 +29,7 @@
#include <linux/seq_file.h>
#include <trace/events/block.h>
#include "md.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
static inline char *bmname(struct bitmap *bitmap)
{
@@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
if (!bh) {
ret = -ENOMEM;
goto out;
@@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
- sb->state = cpu_to_le32(bitmap->flags);
+ /*
+ * clear BITMAP_WRITE_ERROR bit to protect against the case that
+ * a bitmap write error occurred but the later writes succeeded.
+ */
+ sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -625,7 +629,7 @@ re_read:
err = read_sb_page(bitmap->mddev,
offset,
sb_page,
- 0, PAGE_SIZE);
+ 0, sizeof(bitmap_super_t));
}
if (err)
return err;
@@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
BUG_ON(file && mddev->bitmap_info.offset);
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
+ mdname(mddev));
+ return ERR_PTR(-EBUSY);
+ }
+
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap)
return ERR_PTR(-ENOMEM);
@@ -2123,7 +2133,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (store.sb_page && bitmap->storage.sb_page)
memcpy(page_address(store.sb_page),
page_address(bitmap->storage.sb_page),
- PAGE_SIZE);
+ sizeof(bitmap_super_t));
bitmap_file_unmap(&bitmap->storage);
bitmap->storage = store;
@@ -2152,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
for (k = 0; k < page; k++) {
kfree(new_bp[k].map);
}
+ kfree(new_bp);
/* restore some fields from old_counts */
bitmap->counts.bp = old_counts.bp;
@@ -2202,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
block += old_blocks;
}
+ if (bitmap->counts.bp != old_counts.bp) {
+ unsigned long k;
+ for (k = 0; k < old_counts.pages; k++)
+ if (!old_counts.bp[k].hijacked)
+ kfree(old_counts.bp[k].map);
+ kfree(old_counts.bp);
+ }
+
if (!init) {
int i;
while (block < (chunks << chunkshift)) {
diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h
index 5df35ca90f58..5df35ca90f58 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/md-bitmap.h
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 03082e17c65c..79bfbc840385 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -15,7 +15,7 @@
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#include "md-cluster.h"
#define LVB_SIZE 64
@@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
- mddev->pers->quiesce(mddev, 2);
+ mddev->pers->quiesce(mddev, 0);
}
@@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev,
s->lo = lo;
s->hi = hi;
mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
- mddev->pers->quiesce(mddev, 2);
+ mddev->pers->quiesce(mddev, 0);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev)
/*
* return 0 if all the bitmaps have the same sync_size
*/
-int cluster_check_sync_size(struct mddev *mddev)
+static int cluster_check_sync_size(struct mddev *mddev)
{
int i, rv;
bitmap_super_t *sb;
@@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = {
static int __init cluster_init(void)
{
- pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
+ pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
pr_info("Registering Cluster MD functions\n");
register_md_cluster_operations(&cluster_ops, THIS_MODULE);
return 0;
diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c
index 38264b38420f..38264b38420f 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/md-faulty.c
diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c
index c464fb48039a..773fc70dced7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/md-linear.c
@@ -23,7 +23,7 @@
#include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h"
-#include "linear.h"
+#include "md-linear.h"
/*
* find which device holds a particular offset
diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h
index 8381d651d4ed..8381d651d4ed 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/md-linear.h
diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c
index b68e0666b9b0..e40065bdbfc8 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/md-multipath.c
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include "md.h"
-#include "multipath.h"
+#include "md-multipath.h"
#define MAX_WORK_PER_DISK 128
@@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf)
static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
- struct request_queue *q;
int err = -EEXIST;
int path;
struct multipath_info *p;
@@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
- q = rdev->bdev->bd_disk->queue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h
index 0adb941f485a..0adb941f485a 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/md-multipath.h
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0ff1bbf6c90e..4e4dee0ec2de 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -69,7 +69,7 @@
#include <trace/events/block.h>
#include "md.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#include "md-cluster.h"
#ifndef MODULE
@@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
* call has finished, the bio has been linked into some internal structure
* and so is visible to ->quiesce(), so we don't need the refcount any more.
*/
+static bool is_suspended(struct mddev *mddev, struct bio *bio)
+{
+ if (mddev->suspended)
+ return true;
+ if (bio_data_dir(bio) != WRITE)
+ return false;
+ if (mddev->suspend_lo >= mddev->suspend_hi)
+ return false;
+ if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+ return false;
+ if (bio_end_sector(bio) < mddev->suspend_lo)
+ return false;
+ return true;
+}
+
void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
rcu_read_lock();
- if (mddev->suspended) {
+ if (is_suspended(mddev, bio)) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended)
+ if (!is_suspended(mddev, bio))
break;
rcu_read_unlock();
schedule();
@@ -344,12 +359,17 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
void mddev_suspend(struct mddev *mddev)
{
WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
+ lockdep_assert_held(&mddev->reconfig_mutex);
if (mddev->suspended++)
return;
synchronize_rcu();
wake_up(&mddev->sb_wait);
+ set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
+ smp_mb__after_atomic();
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
+ clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
+ wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
del_timer_sync(&mddev->safemode_timer);
}
@@ -357,6 +377,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
+ lockdep_assert_held(&mddev->reconfig_mutex);
if (--mddev->suspended)
return;
wake_up(&mddev->sb_wait);
@@ -520,7 +541,7 @@ static void mddev_put(struct mddev *mddev)
bioset_free(sync_bs);
}
-static void md_safemode_timeout(unsigned long data);
+static void md_safemode_timeout(struct timer_list *t);
void mddev_init(struct mddev *mddev)
{
@@ -529,8 +550,7 @@ void mddev_init(struct mddev *mddev)
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- setup_timer(&mddev->safemode_timer, md_safemode_timeout,
- (unsigned long) mddev);
+ timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
@@ -663,6 +683,7 @@ void mddev_unlock(struct mddev *mddev)
*/
spin_lock(&pers_lock);
md_wakeup_thread(mddev->thread);
+ wake_up(&mddev->sb_wait);
spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -2313,7 +2334,7 @@ static void export_array(struct mddev *mddev)
static bool set_in_sync(struct mddev *mddev)
{
- WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock));
+ lockdep_assert_held(&mddev->lock);
if (!mddev->in_sync) {
mddev->sync_checkers++;
spin_unlock(&mddev->lock);
@@ -2432,10 +2453,18 @@ repeat:
}
}
- /* First make sure individual recovery_offsets are correct */
+ /*
+ * First make sure individual recovery_offsets are correct
+ * curr_resync_completed can only be used during recovery.
+ * During reshape/resync it might use array-addresses rather
+ * that device addresses.
+ */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
@@ -2651,7 +2680,7 @@ state_show(struct md_rdev *rdev, char *page)
{
char *sep = ",";
size_t len = 0;
- unsigned long flags = ACCESS_ONCE(rdev->flags);
+ unsigned long flags = READ_ONCE(rdev->flags);
if (test_bit(Faulty, &flags) ||
(!test_bit(ExternalBbl, &flags) &&
@@ -4824,7 +4853,7 @@ suspend_lo_show(struct mddev *mddev, char *page)
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- unsigned long long old, new;
+ unsigned long long new;
int err;
err = kstrtoull(buf, 10, &new);
@@ -4840,16 +4869,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
goto unlock;
- old = mddev->suspend_lo;
+ mddev_suspend(mddev);
mddev->suspend_lo = new;
- if (new >= old)
- /* Shrinking suspended region */
- mddev->pers->quiesce(mddev, 2);
- else {
- /* Expanding suspended region - need to wait */
- mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
- }
+ mddev_resume(mddev);
+
err = 0;
unlock:
mddev_unlock(mddev);
@@ -4867,7 +4890,7 @@ suspend_hi_show(struct mddev *mddev, char *page)
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- unsigned long long old, new;
+ unsigned long long new;
int err;
err = kstrtoull(buf, 10, &new);
@@ -4880,19 +4903,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
err = -EINVAL;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
+ if (mddev->pers == NULL)
goto unlock;
- old = mddev->suspend_hi;
+
+ mddev_suspend(mddev);
mddev->suspend_hi = new;
- if (new <= old)
- /* Shrinking suspended region */
- mddev->pers->quiesce(mddev, 2);
- else {
- /* Expanding suspended region - need to wait */
- mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
- }
+ mddev_resume(mddev);
+
err = 0;
unlock:
mddev_unlock(mddev);
@@ -5357,7 +5374,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
return NULL;
}
-static int add_named_array(const char *val, struct kernel_param *kp)
+static int add_named_array(const char *val, const struct kernel_param *kp)
{
/*
* val must be "md_*" or "mdNNN".
@@ -5386,9 +5403,9 @@ static int add_named_array(const char *val, struct kernel_param *kp)
return -EINVAL;
}
-static void md_safemode_timeout(unsigned long data)
+static void md_safemode_timeout(struct timer_list *t)
{
- struct mddev *mddev = (struct mddev *) data;
+ struct mddev *mddev = from_timer(mddev, t, safemode_timer);
mddev->safemode = 1;
if (mddev->external)
@@ -5834,8 +5851,14 @@ void md_stop(struct mddev *mddev)
* This is called from dm-raid
*/
__md_stop(mddev);
- if (mddev->bio_set)
+ if (mddev->bio_set) {
bioset_free(mddev->bio_set);
+ mddev->bio_set = NULL;
+ }
+ if (mddev->sync_set) {
+ bioset_free(mddev->sync_set);
+ mddev->sync_set = NULL;
+ }
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -6362,7 +6385,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
break;
}
}
- if (has_journal) {
+ if (has_journal || mddev->bitmap) {
export_rdev(rdev);
return -EBUSY;
}
@@ -6618,22 +6641,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
return -ENOENT; /* cannot remove what isn't there */
err = 0;
if (mddev->pers) {
- mddev->pers->quiesce(mddev, 1);
if (fd >= 0) {
struct bitmap *bitmap;
bitmap = bitmap_create(mddev, -1);
+ mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = bitmap_load(mddev);
} else
err = PTR_ERR(bitmap);
- }
- if (fd < 0 || err) {
+ if (err) {
+ bitmap_destroy(mddev);
+ fd = -1;
+ }
+ mddev_resume(mddev);
+ } else if (fd < 0) {
+ mddev_suspend(mddev);
bitmap_destroy(mddev);
- fd = -1; /* make sure to put the file */
+ mddev_resume(mddev);
}
- mddev->pers->quiesce(mddev, 0);
}
if (fd < 0) {
struct file *f = mddev->bitmap_info.file;
@@ -6735,7 +6762,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
- WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+ lockdep_assert_held(&mddev->reconfig_mutex);
if (mddev->external_size)
return;
@@ -6917,8 +6944,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- mddev->pers->quiesce(mddev, 1);
bitmap = bitmap_create(mddev, -1);
+ mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = bitmap_load(mddev);
@@ -6926,7 +6953,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = PTR_ERR(bitmap);
if (rv)
bitmap_destroy(mddev);
- mddev->pers->quiesce(mddev, 0);
+ mddev_resume(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
@@ -6949,9 +6976,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.nodes = 0;
md_cluster_ops->leave(mddev);
}
- mddev->pers->quiesce(mddev, 1);
+ mddev_suspend(mddev);
bitmap_destroy(mddev);
- mddev->pers->quiesce(mddev, 0);
+ mddev_resume(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7468,8 +7495,8 @@ void md_wakeup_thread(struct md_thread *thread)
{
if (thread) {
pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
- if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
- wake_up(&thread->wqueue);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
}
}
EXPORT_SYMBOL(md_wakeup_thread);
@@ -7578,7 +7605,9 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
- } else
+ } else if (resync > max_sectors)
+ resync = max_sectors;
+ else
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
@@ -8039,7 +8068,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
+ mddev->suspended);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
percpu_ref_put(&mddev->writes_pending);
return false;
@@ -8110,7 +8140,6 @@ void md_allow_write(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
/* wait for the dirty state to be recorded in the metadata */
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
} else
spin_unlock(&mddev->lock);
@@ -8477,16 +8506,19 @@ void md_do_sync(struct md_thread *thread)
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- mddev->delta_disks >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < mddev->curr_resync)
- rdev->recovery_offset = mddev->curr_resync;
- rcu_read_unlock();
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < mddev->curr_resync)
+ rdev->recovery_offset = mddev->curr_resync;
+ rcu_read_unlock();
+ }
}
}
skip:
@@ -8813,6 +8845,16 @@ void md_check_recovery(struct mddev *mddev)
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
+ } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -9274,11 +9316,11 @@ static __exit void md_exit(void)
subsys_initcall(md_init);
module_exit(md_exit)
-static int get_ro(char *buffer, struct kernel_param *kp)
+static int get_ro(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%d", start_readonly);
}
-static int set_ro(const char *val, struct kernel_param *kp)
+static int set_ro(const char *val, const struct kernel_param *kp)
{
return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d8287d3cd1bf..7d6bcf0eba0c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -237,6 +237,12 @@ enum mddev_flags {
*/
MD_HAS_PPL, /* The raid array has PPL feature set */
MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */
+ MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update
+ * the metadata without taking reconfig_mutex.
+ */
+ MD_UPDATING_SB, /* md_check_recovery is updating the metadata
+ * without explicitly holding reconfig_mutex.
+ */
};
enum mddev_sb_flags {
@@ -494,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev)
mutex_lock(&mddev->reconfig_mutex);
}
-static inline int mddev_is_locked(struct mddev *mddev)
-{
- return mutex_is_locked(&mddev->reconfig_mutex);
-}
-
static inline int mddev_trylock(struct mddev *mddev)
{
return mutex_trylock(&mddev->reconfig_mutex);
@@ -538,12 +539,11 @@ struct md_personality
int (*check_reshape) (struct mddev *mddev);
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
- /* quiesce moves between quiescence states
- * 0 - fully active
- * 1 - no new requests allowed
- * others - reserved
+ /* quiesce suspends or resumes internal processing.
+ * 1 - stop new actions and wait for action io to complete
+ * 0 - return to normal behaviour
*/
- void (*quiesce) (struct mddev *mddev, int state);
+ void (*quiesce) (struct mddev *mddev, int quiesce);
/* takeover is used to transition an array from one
* personality to another. The new personality must be able
* to handle the data in the current layout.
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index f21ce6a3d4cf..58b319757b1e 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -683,23 +683,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
pn->keys[1] = rn->keys[0];
memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
- /*
- * rejig the spine. This is ugly, since it knows too
- * much about the spine
- */
- if (s->nodes[0] != new_parent) {
- unlock_block(s->info, s->nodes[0]);
- s->nodes[0] = new_parent;
- }
- if (key < le64_to_cpu(rn->keys[0])) {
- unlock_block(s->info, right);
- s->nodes[1] = left;
- } else {
- unlock_block(s->info, left);
- s->nodes[1] = right;
- }
- s->count = 2;
-
+ unlock_block(s->info, left);
+ unlock_block(s->info, right);
return 0;
}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 4aed69d9dd17..aec449243966 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
+#include <linux/kernel.h>
#define DM_MSG_PREFIX "space map metadata"
@@ -111,7 +112,7 @@ static bool brb_empty(struct bop_ring_buffer *brb)
static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
{
unsigned r = old + 1;
- return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+ return r >= ARRAY_SIZE(brb->bops) ? 0 : r;
}
static int brb_push(struct bop_ring_buffer *brb,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5a00fc118470..5ecba9eef441 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
-static void raid0_quiesce(struct mddev *mddev, int state)
+static void raid0_quiesce(struct mddev *mddev, int quiesce)
{
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f3f3e40dc9d8..6df398e3a008 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,13 +37,12 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
-#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid1.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
@@ -810,11 +809,15 @@ static void flush_pending_writes(struct r1conf *conf)
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
+ struct blk_plug plug;
struct bio *bio;
+
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
+ blk_start_plug(&plug);
flush_bio_list(conf, bio);
+ blk_finish_plug(&plug);
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -990,14 +993,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
_wait_barrier(conf, idx);
}
-static void wait_all_barriers(struct r1conf *conf)
-{
- int idx;
-
- for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
- _wait_barrier(conf, idx);
-}
-
static void _allow_barrier(struct r1conf *conf, int idx)
{
atomic_dec(&conf->nr_pending[idx]);
@@ -1011,14 +1006,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
_allow_barrier(conf, idx);
}
-static void allow_all_barriers(struct r1conf *conf)
-{
- int idx;
-
- for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
- _allow_barrier(conf, idx);
-}
-
/* conf->resync_lock should be held */
static int get_unqueued_pending(struct r1conf *conf)
{
@@ -1303,42 +1290,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
-
-
- if ((bio_end_sector(bio) > mddev->suspend_lo &&
- bio->bi_iter.bi_sector < mddev->suspend_hi) ||
- (mddev_is_clustered(mddev) &&
+ if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
+ bio->bi_iter.bi_sector, bio_end_sector(bio))) {
- /*
- * As the suspend_* range is controlled by userspace, we want
- * an interruptible wait.
- */
DEFINE_WAIT(w);
for (;;) {
- sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
- &w, TASK_INTERRUPTIBLE);
- if (bio_end_sector(bio) <= mddev->suspend_lo ||
- bio->bi_iter.bi_sector >= mddev->suspend_hi ||
- (mddev_is_clustered(mddev) &&
- !md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector,
- bio_end_sector(bio))))
+ &w, TASK_IDLE);
+ if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)))
break;
- sigfillset(&full);
- sigprocmask(SIG_BLOCK, &full, &old);
schedule();
- sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
wait_barrier(conf, bio->bi_iter.bi_sector);
r1_bio = alloc_r1bio(mddev, bio);
@@ -1654,8 +1627,12 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf)
{
- wait_all_barriers(conf);
- allow_all_barriers(conf);
+ int idx;
+
+ for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
+ _wait_barrier(conf, idx);
+ _allow_barrier(conf, idx);
+ }
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
@@ -3277,21 +3254,14 @@ static int raid1_reshape(struct mddev *mddev)
return 0;
}
-static void raid1_quiesce(struct mddev *mddev, int state)
+static void raid1_quiesce(struct mddev *mddev, int quiesce)
{
struct r1conf *conf = mddev->private;
- switch(state) {
- case 2: /* wake for suspend */
- wake_up(&conf->wait_barrier);
- break;
- case 1:
+ if (quiesce)
freeze_array(conf, 0);
- break;
- case 0:
+ else
unfreeze_array(conf);
- break;
- }
}
static void *raid1_takeover(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 374df5796649..c131835cf008 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,7 +29,7 @@
#include "md.h"
#include "raid10.h"
#include "raid0.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/*
* When performing a resync, we need to read and compare, so
@@ -383,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio)
{
int uptodate = !bio->bi_status;
struct r10bio *r10_bio = bio->bi_private;
- int slot, dev;
+ int slot;
struct md_rdev *rdev;
struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
- dev = r10_bio->devs[slot].devnum;
rdev = r10_bio->devs[slot].rdev;
/*
* this branch is our 'one mirror IO has finished' event handler:
@@ -748,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
- sectors = r10_bio->sectors;
best_slot = -1;
best_rdev = NULL;
best_dist = MaxSector;
@@ -761,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
* the resync window. We take the first readable disk when
* above the resync window.
*/
- if (conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync))
+ if ((conf->mddev->recovery_cp < MaxSector
+ && (this_sector + sectors >= conf->next_resync)) ||
+ (mddev_is_clustered(conf->mddev) &&
+ md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
+ this_sector + sectors)))
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
@@ -890,10 +894,13 @@ static void flush_pending_writes(struct r10conf *conf)
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
+ struct blk_plug plug;
struct bio *bio;
+
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
+ blk_start_plug(&plug);
/* flush any pending bitmap writes to disk
* before proceeding w/ I/O */
bitmap_unplug(conf->mddev->bitmap);
@@ -914,6 +921,7 @@ static void flush_pending_writes(struct r10conf *conf)
generic_make_request(bio);
bio = next;
}
+ blk_finish_plug(&plug);
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -1293,6 +1301,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sector_t sectors;
int max_sectors;
+ if ((mddev_is_clustered(mddev) &&
+ md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)))) {
+ DEFINE_WAIT(w);
+ for (;;) {
+ prepare_to_wait(&conf->wait_barrier,
+ &w, TASK_IDLE);
+ if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector, bio_end_sector(bio)))
+ break;
+ schedule();
+ }
+ finish_wait(&conf->wait_barrier, &w);
+ }
+
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -2575,7 +2599,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
- sector_t bio_last_sector;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
@@ -2586,7 +2609,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
- bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
r10_bio->devs[slot].bio = NULL;
@@ -2826,6 +2848,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
}
/*
+ * Set cluster_sync_high since we need other nodes to add the
+ * range [cluster_sync_low, cluster_sync_high] to suspend list.
+ */
+static void raid10_set_cluster_sync_high(struct r10conf *conf)
+{
+ sector_t window_size;
+ int extra_chunk, chunks;
+
+ /*
+ * First, here we define "stripe" as a unit which across
+ * all member devices one time, so we get chunks by use
+ * raid_disks / near_copies. Otherwise, if near_copies is
+ * close to raid_disks, then resync window could increases
+ * linearly with the increase of raid_disks, which means
+ * we will suspend a really large IO window while it is not
+ * necessary. If raid_disks is not divisible by near_copies,
+ * an extra chunk is needed to ensure the whole "stripe" is
+ * covered.
+ */
+
+ chunks = conf->geo.raid_disks / conf->geo.near_copies;
+ if (conf->geo.raid_disks % conf->geo.near_copies == 0)
+ extra_chunk = 0;
+ else
+ extra_chunk = 1;
+ window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
+
+ /*
+ * At least use a 32M window to align with raid1's resync window
+ */
+ window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
+ CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
+
+ conf->cluster_sync_high = conf->cluster_sync_low + window_size;
+}
+
+/*
* perform a "sync" on one "block"
*
* We need to make sure that no normal I/O request - particularly write
@@ -2897,6 +2956,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
+ conf->cluster_sync_low = 0;
+ conf->cluster_sync_high = 0;
+
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chucks (there can
* be several when recovering multiple devices).
@@ -3251,7 +3313,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
- bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
+ /*
+ * Since curr_resync_completed could probably not update in
+ * time, and we will set cluster_sync_low based on it.
+ * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
+ * safety reason, which ensures curr_resync_completed is
+ * updated in bitmap_cond_end_sync.
+ */
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS >
+ conf->cluster_sync_high));
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
@@ -3385,6 +3457,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
} while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
+ if (mddev_is_clustered(mddev) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* It is resync not recovery */
+ if (conf->cluster_sync_high < sector_nr + nr_sectors) {
+ conf->cluster_sync_low = mddev->curr_resync_completed;
+ raid10_set_cluster_sync_high(conf);
+ /* Send resync message */
+ md_cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+ } else if (mddev_is_clustered(mddev)) {
+ /* This is recovery not resync */
+ sector_t sect_va1, sect_va2;
+ bool broadcast_msg = false;
+
+ for (i = 0; i < conf->geo.raid_disks; i++) {
+ /*
+ * sector_nr is a device address for recovery, so we
+ * need translate it to array address before compare
+ * with cluster_sync_high.
+ */
+ sect_va1 = raid10_find_virt(conf, sector_nr, i);
+
+ if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
+ broadcast_msg = true;
+ /*
+ * curr_resync_completed is similar as
+ * sector_nr, so make the translation too.
+ */
+ sect_va2 = raid10_find_virt(conf,
+ mddev->curr_resync_completed, i);
+
+ if (conf->cluster_sync_low == 0 ||
+ conf->cluster_sync_low > sect_va2)
+ conf->cluster_sync_low = sect_va2;
+ }
+ }
+ if (broadcast_msg) {
+ raid10_set_cluster_sync_high(conf);
+ md_cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+ }
+
while (biolist) {
bio = biolist;
biolist = biolist->bi_next;
@@ -3644,6 +3762,18 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
+ if (mddev_is_clustered(conf->mddev)) {
+ int fc, fo;
+
+ fc = (mddev->layout >> 8) & 255;
+ fo = mddev->layout & (1<<16);
+ if (fc > 1 || fo > 0) {
+ pr_err("only near layout is supported by clustered"
+ " raid10\n");
+ goto out;
+ }
+ }
+
mddev->thread = conf->thread;
conf->thread = NULL;
@@ -3832,18 +3962,14 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf);
}
-static void raid10_quiesce(struct mddev *mddev, int state)
+static void raid10_quiesce(struct mddev *mddev, int quiesce)
{
struct r10conf *conf = mddev->private;
- switch(state) {
- case 1:
+ if (quiesce)
raise_barrier(conf, 0);
- break;
- case 0:
+ else
lower_barrier(conf);
- break;
- }
}
static int raid10_resize(struct mddev *mddev, sector_t sectors)
@@ -4578,15 +4704,18 @@ static int handle_reshape_read_error(struct mddev *mddev,
/* Use sync reads to get the blocks from somewhere else */
int sectors = r10_bio->sectors;
struct r10conf *conf = mddev->private;
- struct {
- struct r10bio r10_bio;
- struct r10dev devs[conf->copies];
- } on_stack;
- struct r10bio *r10b = &on_stack.r10_bio;
+ struct r10bio *r10b;
int slot = 0;
int idx = 0;
struct page **pages;
+ r10b = kmalloc(sizeof(*r10b) +
+ sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+ if (!r10b) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ return -ENOMEM;
+ }
+
/* reshape IOs share pages from .devs[0].bio */
pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
@@ -4635,11 +4764,13 @@ static int handle_reshape_read_error(struct mddev *mddev,
/* couldn't read this block, must give up */
set_bit(MD_RECOVERY_INTR,
&mddev->recovery);
+ kfree(r10b);
return -EIO;
}
sectors -= s;
idx++;
}
+ kfree(r10b);
return 0;
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index abceccab6671..db2ac22ac1b4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -89,6 +89,12 @@ struct r10conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
+
+ /*
+ * Keep track of cluster resync window to send to other nodes.
+ */
+ sector_t cluster_sync_low;
+ sector_t cluster_sync_high;
};
/*
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0b7406ac8ce1..39f31f07ffe9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -23,7 +23,7 @@
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#include "raid5-log.h"
/*
@@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
- assert_spin_locked(&log->io_list_lock);
+ lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
@@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
- assert_spin_locked(&log->io_list_lock);
+ lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
@@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work)
struct r5l_log *log = container_of(work, struct r5l_log,
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
@@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
-
- mddev_suspend(mddev);
- log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
- mddev_resume(mddev);
+ conf->log == NULL ||
+ (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
+ (locked = mddev_trylock(mddev))));
+ if (locked) {
+ mddev_suspend(mddev);
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ mddev_resume(mddev);
+ mddev_unlock(mddev);
+ }
}
static void r5l_submit_current_io(struct r5l_log *log)
@@ -1194,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
struct stripe_head *sh;
- assert_spin_locked(&log->io_list_lock);
+ lockdep_assert_held(&log->io_list_lock);
if (!list_empty(&log->no_mem_stripes)) {
sh = list_first_entry(&log->no_mem_stripes,
@@ -1210,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
struct r5l_io_unit *io, *next;
bool found = false;
- assert_spin_locked(&log->io_list_lock);
+ lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
/* don't change list order */
@@ -1382,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
* raid5_release_stripe() while holding conf->device_lock
*/
BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
- assert_spin_locked(&conf->device_lock);
+ lockdep_assert_held(&conf->device_lock);
list_del_init(&sh->lru);
atomic_inc(&sh->count);
@@ -1409,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
int count;
struct stripe_head *sh, *next;
- assert_spin_locked(&conf->device_lock);
+ lockdep_assert_held(&conf->device_lock);
if (!conf->log)
return;
@@ -1583,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
md_wakeup_thread(log->reclaim_thread);
}
-void r5l_quiesce(struct r5l_log *log, int state)
+void r5l_quiesce(struct r5l_log *log, int quiesce)
{
struct mddev *mddev;
- if (!log || state == 2)
+ if (!log)
return;
- if (state == 0)
- kthread_unpark(log->reclaim_thread->tsk);
- else if (state == 1) {
+
+ if (quiesce) {
/* make sure r5l_write_super_and_discard_space exits */
mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
kthread_park(log->reclaim_thread->tsk);
r5l_wake_reclaim(log, MaxSector);
r5l_do_reclaim(log);
- }
+ } else
+ kthread_unpark(log->reclaim_thread->tsk);
}
bool r5l_log_disk_error(struct r5conf *conf)
@@ -2571,31 +2577,22 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
int r5c_journal_mode_set(struct mddev *mddev, int mode)
{
struct r5conf *conf;
- int err;
if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
mode > R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- err = mddev_lock(mddev);
- if (err)
- return err;
conf = mddev->private;
- if (!conf || !conf->log) {
- mddev_unlock(mddev);
+ if (!conf || !conf->log)
return -ENODEV;
- }
if (raid5_calc_degraded(conf) > 0 &&
- mode == R5C_JOURNAL_MODE_WRITE_BACK) {
- mddev_unlock(mddev);
+ mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- }
mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
mddev_resume(mddev);
- mddev_unlock(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2608,6 +2605,7 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
{
int mode = ARRAY_SIZE(r5c_journal_mode_str);
size_t len = length;
+ int ret;
if (len < 2)
return -EINVAL;
@@ -2619,8 +2617,12 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
if (strlen(r5c_journal_mode_str[mode]) == len &&
!strncmp(page, r5c_journal_mode_str[mode], len))
break;
-
- return r5c_journal_mode_set(mddev, mode) ?: length;
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+ ret = r5c_journal_mode_set(mddev, mode);
+ mddev_unlock(mddev);
+ return ret ?: length;
}
struct md_sysfs_entry
@@ -3165,6 +3167,8 @@ void r5l_exit_log(struct r5conf *conf)
conf->log = NULL;
synchronize_rcu();
+ /* Ensure disable_writeback_work wakes up and exits */
+ wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 7f9ad5f7cda0..284578b0a349 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -9,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
+extern void r5l_quiesce(struct r5l_log *log, int quiesce);
extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index cd026c88f7ef..628c0bf7b9fd 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)sector);
rdev = conf->disks[dd_idx].rdev;
- if (!rdev) {
+ if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
+ sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
__func__, indent, "", dd_idx);
update_parity = false;
@@ -1296,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf)
if (ret) {
goto err;
- } else if (!mddev->pers &&
- mddev->recovery_cp == 0 && !mddev->degraded &&
+ } else if (!mddev->pers && mddev->recovery_cp == 0 &&
ppl_conf->recovered_entries > 0 &&
ppl_conf->mismatch_count == 0) {
/*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 928e24a07133..98ce4272ace9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,7 +55,6 @@
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
-#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/list_sort.h>
@@ -63,7 +62,7 @@
#include "md.h"
#include "raid5.h"
#include "raid0.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -1818,8 +1817,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
+ if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
set_bit(R5_UPTODATE, &dev->flags);
+ if (test_bit(STRIPE_EXPAND_READY, &sh->state))
+ set_bit(R5_Expanded, &dev->flags);
+ }
if (fua)
set_bit(R5_WantFUA, &dev->flags);
if (sync)
@@ -2675,13 +2677,13 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
pr_debug("raid456: error called\n");
spin_lock_irqsave(&conf->device_lock, flags);
+ set_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
- set_bit(Faulty, &rdev->flags);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
@@ -5682,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
goto retry;
}
- if (rw == WRITE &&
- logical_sector >= mddev->suspend_lo &&
- logical_sector < mddev->suspend_hi) {
- raid5_release_stripe(sh);
- /* As the suspend_* range is controlled by
- * userspace, we want an interruptible
- * wait.
- */
- prepare_to_wait(&conf->wait_for_overlap,
- &w, TASK_INTERRUPTIBLE);
- if (logical_sector >= mddev->suspend_lo &&
- logical_sector < mddev->suspend_hi) {
- sigset_t full, old;
- sigfillset(&full);
- sigprocmask(SIG_BLOCK, &full, &old);
- schedule();
- sigprocmask(SIG_SETMASK, &old, NULL);
- do_prepare = true;
- }
- goto retry;
- }
-
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
/* Stripe is busy expanding or
@@ -5758,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
*/
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
+ struct md_rdev *rdev;
sector_t first_sector, last_sector;
int raid_disks = conf->previous_raid_disks;
int data_disks = raid_disks - conf->max_degraded;
@@ -5880,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
return 0;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
+ if (!mddev->reshape_backwards)
+ /* Can update recovery_offset */
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < sector_nr)
+ rdev->recovery_offset = sector_nr;
+
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
@@ -5978,6 +5968,14 @@ finish:
goto ret;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
+ if (!mddev->reshape_backwards)
+ /* Can update recovery_offset */
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < sector_nr)
+ rdev->recovery_offset = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
@@ -6072,7 +6070,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
*/
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
@@ -7156,6 +7154,13 @@ static int raid5_run(struct mddev *mddev)
min_offset_diff = diff;
}
+ if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
+ (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
+ pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
* Difficulties arise if the stripe we would write to
@@ -7958,6 +7963,7 @@ static void end_reshape(struct r5conf *conf)
{
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+ struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
@@ -7965,6 +7971,11 @@ static void end_reshape(struct r5conf *conf)
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;
+ rdev_for_each(rdev, conf->mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags))
+ rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
@@ -8020,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
}
}
-static void raid5_quiesce(struct mddev *mddev, int state)
+static void raid5_quiesce(struct mddev *mddev, int quiesce)
{
struct r5conf *conf = mddev->private;
- switch(state) {
- case 2: /* resume for a suspend */
- wake_up(&conf->wait_for_overlap);
- break;
-
- case 1: /* stop all writes */
+ if (quiesce) {
+ /* stop all writes */
lock_all_device_hash_locks_irq(conf);
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
@@ -8045,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
- break;
-
- case 0: /* re-enable writes */
+ } else {
+ /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
- break;
}
- r5l_quiesce(conf->log, state);
+ r5l_quiesce(conf->log, quiesce);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)