aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/request.c23
-rw-r--r--drivers/md/dm-bio-prison.c186
-rw-r--r--drivers/md/dm-bio-prison.h28
-rw-r--r--drivers/md/dm-bufio.c238
-rw-r--r--drivers/md/dm-cache-block-types.h11
-rw-r--r--drivers/md/dm-cache-metadata.c34
-rw-r--r--drivers/md/dm-cache-metadata.h6
-rw-r--r--drivers/md/dm-cache-policy-mq.c82
-rw-r--r--drivers/md/dm-cache-target.c378
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-ioctl.c5
-rw-r--r--drivers/md/dm-raid.c17
-rw-r--r--drivers/md/dm-stats.c2
-rw-r--r--drivers/md/dm-stripe.c4
-rw-r--r--drivers/md/dm-table.c36
-rw-r--r--drivers/md/dm-thin-metadata.c35
-rw-r--r--drivers/md/dm-thin-metadata.h9
-rw-r--r--drivers/md/dm-thin.c766
-rw-r--r--drivers/md/dm.c286
-rw-r--r--drivers/md/dm.h10
-rw-r--r--drivers/md/md.c48
-rw-r--r--drivers/md/persistent-data/dm-array.c4
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h6
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c2
-rw-r--r--drivers/md/persistent-data/dm-btree.c24
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c8
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c77
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h7
-rw-r--r--drivers/md/raid5.c7
29 files changed, 1701 insertions, 640 deletions
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 62e6e98186b5..ab43faddb447 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -601,13 +601,8 @@ static void request_endio(struct bio *bio, int error)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- int cpu, rw = bio_data_dir(s->orig_bio);
- unsigned long duration = jiffies - s->start_time;
-
- cpu = part_stat_lock();
- part_round_stats(cpu, &s->d->disk->part0);
- part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
- part_stat_unlock();
+ generic_end_io_acct(bio_data_dir(s->orig_bio),
+ &s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
bio_endio(s->orig_bio, s->iop.error);
@@ -959,12 +954,9 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
struct search *s;
struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- int cpu, rw = bio_data_dir(bio);
+ int rw = bio_data_dir(bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &d->disk->part0, ios[rw]);
- part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
- part_stat_unlock();
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
bio->bi_bdev = dc->bdev;
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1074,12 +1066,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
struct search *s;
struct closure *cl;
struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
- int cpu, rw = bio_data_dir(bio);
+ int rw = bio_data_dir(bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &d->disk->part0, ios[rw]);
- part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
- part_stat_unlock();
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
s = search_alloc(bio, d);
cl = &s->cl;
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..be065300e93c 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
/*----------------------------------------------------------------*/
-struct bucket {
- spinlock_t lock;
- struct hlist_head cells;
-};
+#define MIN_CELLS 1024
struct dm_bio_prison {
+ spinlock_t lock;
mempool_t *cell_pool;
-
- unsigned nr_buckets;
- unsigned hash_mask;
- struct bucket *buckets;
+ struct rb_root cells;
};
-/*----------------------------------------------------------------*/
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
- uint32_t n = 128;
-
- nr_cells /= 4;
- nr_cells = min(nr_cells, 8192u);
-
- while (n < nr_cells)
- n <<= 1;
-
- return n;
-}
-
static struct kmem_cache *_cell_cache;
-static void init_bucket(struct bucket *b)
-{
- spin_lock_init(&b->lock);
- INIT_HLIST_HEAD(&b->cells);
-}
+/*----------------------------------------------------------------*/
/*
* @nr_cells should be the number of cells you want in use _concurrently_.
* Don't confuse it with the number of distinct keys.
*/
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+struct dm_bio_prison *dm_bio_prison_create(void)
{
- unsigned i;
- uint32_t nr_buckets = calc_nr_buckets(nr_cells);
- size_t len = sizeof(struct dm_bio_prison) +
- (sizeof(struct bucket) * nr_buckets);
- struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+ struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
- prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+ spin_lock_init(&prison->lock);
+
+ prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
return NULL;
}
- prison->nr_buckets = nr_buckets;
- prison->hash_mask = nr_buckets - 1;
- prison->buckets = (struct bucket *) (prison + 1);
- for (i = 0; i < nr_buckets; i++)
- init_bucket(prison->buckets + i);
+ prison->cells = RB_ROOT;
return prison;
}
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
-static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
+static void __setup_new_cell(struct dm_cell_key *key,
+ struct bio *holder,
+ struct dm_bio_prison_cell *cell)
{
- const unsigned long BIG_PRIME = 4294967291UL;
- uint64_t hash = key->block * BIG_PRIME;
-
- return (uint32_t) (hash & prison->hash_mask);
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = holder;
+ bio_list_init(&cell->bios);
}
-static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
+static int cmp_keys(struct dm_cell_key *lhs,
+ struct dm_cell_key *rhs)
{
- return (lhs->virtual == rhs->virtual) &&
- (lhs->dev == rhs->dev) &&
- (lhs->block == rhs->block);
-}
+ if (lhs->virtual < rhs->virtual)
+ return -1;
-static struct bucket *get_bucket(struct dm_bio_prison *prison,
- struct dm_cell_key *key)
-{
- return prison->buckets + hash_key(prison, key);
-}
+ if (lhs->virtual > rhs->virtual)
+ return 1;
-static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
- struct dm_cell_key *key)
-{
- struct dm_bio_prison_cell *cell;
+ if (lhs->dev < rhs->dev)
+ return -1;
- hlist_for_each_entry(cell, &b->cells, list)
- if (keys_equal(&cell->key, key))
- return cell;
+ if (lhs->dev > rhs->dev)
+ return 1;
- return NULL;
-}
+ if (lhs->block_end <= rhs->block_begin)
+ return -1;
-static void __setup_new_cell(struct bucket *b,
- struct dm_cell_key *key,
- struct bio *holder,
- struct dm_bio_prison_cell *cell)
-{
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->holder = holder;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, &b->cells);
+ if (lhs->block_begin >= rhs->block_end)
+ return 1;
+
+ return 0;
}
-static int __bio_detain(struct bucket *b,
+static int __bio_detain(struct dm_bio_prison *prison,
struct dm_cell_key *key,
struct bio *inmate,
struct dm_bio_prison_cell *cell_prealloc,
struct dm_bio_prison_cell **cell_result)
{
- struct dm_bio_prison_cell *cell;
-
- cell = __search_bucket(b, key);
- if (cell) {
- if (inmate)
- bio_list_add(&cell->bios, inmate);
- *cell_result = cell;
- return 1;
+ int r;
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell *cell =
+ container_of(*new, struct dm_bio_prison_cell, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+ else if (r > 0)
+ new = &((*new)->rb_right);
+ else {
+ if (inmate)
+ bio_list_add(&cell->bios, inmate);
+ *cell_result = cell;
+ return 1;
+ }
}
- __setup_new_cell(b, key, inmate, cell_prealloc);
+ __setup_new_cell(key, inmate, cell_prealloc);
*cell_result = cell_prealloc;
+
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
return 0;
}
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
{
int r;
unsigned long flags;
- struct bucket *b = get_bucket(prison, key);
- spin_lock_irqsave(&b->lock, flags);
- r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
/*
* @inmates must have been initialised prior to this call
*/
-static void __cell_release(struct dm_bio_prison_cell *cell,
+static void __cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
if (inmates) {
if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct bio_list *bios)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release(cell, bios);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release(prison, cell, bios);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
/*
* Sometimes we don't want the holder, just the additional bios.
*/
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
bio_list_merge(inmates, &cell->bios);
}
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct bio_list *inmates)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release_no_holder(cell, inmates);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(prison, cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
@@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_error);
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ visit_fn(context, cell);
+ rb_erase(&cell->node, &prison->cells);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..74cf01144b1f 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
-#include <linux/list.h>
#include <linux/bio.h>
+#include <linux/rbtree.h>
/*----------------------------------------------------------------*/
@@ -23,11 +23,14 @@
*/
struct dm_bio_prison;
-/* FIXME: this needs to be more abstract */
+/*
+ * Keys define a range of blocks within either a virtual or physical
+ * device.
+ */
struct dm_cell_key {
int virtual;
dm_thin_id dev;
- dm_block_t block;
+ dm_block_t block_begin, block_end;
};
/*
@@ -35,13 +38,15 @@ struct dm_cell_key {
* themselves.
*/
struct dm_bio_prison_cell {
- struct hlist_node list;
+ struct list_head user_list; /* for client use */
+ struct rb_node node;
+
struct dm_cell_key key;
struct bio *holder;
struct bio_list bios;
};
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+struct dm_bio_prison *dm_bio_prison_create(void);
void dm_bio_prison_destroy(struct dm_bio_prison *prison);
/*
@@ -57,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell);
/*
- * Creates, or retrieves a cell for the given key.
+ * Creates, or retrieves a cell that overlaps the given key.
*
* Returns 1 if pre-existing cell returned, zero if new cell created using
* @cell_prealloc.
@@ -68,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell **cell_result);
/*
- * An atomic op that combines retrieving a cell, and adding a bio to it.
+ * An atomic op that combines retrieving or creating a cell, and adding a
+ * bio to it.
*
* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
@@ -87,6 +93,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
void dm_cell_error(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell, int error);
+/*
+ * Visits the cell and then releases. Guarantees no new inmates are
+ * inserted between the visit and release.
+ */
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context, struct dm_bio_prison_cell *cell);
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 825ca1f87639..c33b49792b87 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
+#include <linux/rbtree.h>
#define DM_MSG_PREFIX "bufio"
@@ -34,26 +35,23 @@
/*
* Check buffer ages in this interval (seconds)
*/
-#define DM_BUFIO_WORK_TIMER_SECS 10
+#define DM_BUFIO_WORK_TIMER_SECS 30
/*
* Free buffers when they are older than this (seconds)
*/
-#define DM_BUFIO_DEFAULT_AGE_SECS 60
+#define DM_BUFIO_DEFAULT_AGE_SECS 300
/*
- * The number of bvec entries that are embedded directly in the buffer.
- * If the chunk size is larger, dm-io is used to do the io.
+ * The nr of bytes of cached data to keep around.
*/
-#define DM_BUFIO_INLINE_VECS 16
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
/*
- * Buffer hash
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
*/
-#define DM_BUFIO_HASH_BITS 20
-#define DM_BUFIO_HASH(block) \
- ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
- ((1 << DM_BUFIO_HASH_BITS) - 1))
+#define DM_BUFIO_INLINE_VECS 16
/*
* Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -106,7 +104,7 @@ struct dm_bufio_client {
unsigned minimum_buffers;
- struct hlist_head *cache_hash;
+ struct rb_root buffer_tree;
wait_queue_head_t free_buffer_wait;
int async_write_error;
@@ -135,7 +133,7 @@ enum data_mode {
};
struct dm_buffer {
- struct hlist_node hash_list;
+ struct rb_node node;
struct list_head lru_list;
sector_t block;
void *data;
@@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients);
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+ struct rb_node *n = c->buffer_tree.rb_node;
+ struct dm_buffer *b;
+
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ n = (b->block < block) ? n->rb_left : n->rb_right;
+ }
+
+ return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+ struct dm_buffer *found;
+
+ while (*new) {
+ found = container_of(*new, struct dm_buffer, node);
+
+ if (found->block == b->block) {
+ BUG_ON(found != b);
+ return;
+ }
+
+ parent = *new;
+ new = (found->block < b->block) ?
+ &((*new)->rb_left) : &((*new)->rb_right);
+ }
+
+ rb_link_node(&b->node, parent, new);
+ rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ rb_erase(&b->node, &c->buffer_tree);
+}
+
/*----------------------------------------------------------------*/
static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
b->block = block;
b->list_mode = dirty;
list_add(&b->lru_list, &c->lru[dirty]);
- hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+ __insert(b->c, b);
b->last_accessed = jiffies;
}
@@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b)
BUG_ON(!c->n_buffers[b->list_mode]);
c->n_buffers[b->list_mode]--;
- hlist_del(&b->hash_list);
+ __remove(b->c, b);
list_del(&b->lru_list);
}
@@ -532,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
end_io(&b->bio, r);
}
+static void inline_endio(struct bio *bio, int error)
+{
+ bio_end_io_t *end_fn = bio->bi_private;
+
+ /*
+ * Reset the bio to free any attached resources
+ * (e.g. bio integrity profiles).
+ */
+ bio_reset(bio);
+
+ end_fn(bio, error);
+}
+
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
{
@@ -543,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
b->bio.bi_bdev = b->c->bdev;
- b->bio.bi_end_io = end_io;
+ b->bio.bi_end_io = inline_endio;
+ /*
+ * Use of .bi_private isn't a problem here because
+ * the dm_buffer's inline bio is local to bufio.
+ */
+ b->bio.bi_private = end_io;
/*
* We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -887,23 +951,6 @@ static void __check_watermark(struct dm_bufio_client *c,
__write_dirty_buffers_async(c, 1, write_list);
}
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
- struct dm_buffer *b;
-
- hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
- hash_list) {
- dm_bufio_cond_resched();
- if (b->block == block)
- return b;
- }
-
- return NULL;
-}
-
/*----------------------------------------------------------------
* Getting a buffer
*--------------------------------------------------------------*/
@@ -1433,45 +1480,52 @@ static void drop_buffers(struct dm_bufio_client *c)
}
/*
- * Test if the buffer is unused and too old, and commit it.
- * At if noio is set, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
- * different bufio client.
+ * We may not be able to evict this buffer if IO pending or the client
+ * is still using it. Caller is expected to know buffer is too old.
+ *
+ * And if GFP_NOFS is used, we must not do any I/O because we hold
+ * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
+ * rerouted to different bufio client.
*/
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
- unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
- if (jiffies - b->last_accessed < max_jiffies)
- return 0;
-
- if (!(gfp & __GFP_IO)) {
+ if (!(gfp & __GFP_FS)) {
if (test_bit(B_READING, &b->state) ||
test_bit(B_WRITING, &b->state) ||
test_bit(B_DIRTY, &b->state))
- return 0;
+ return false;
}
if (b->hold_count)
- return 0;
+ return false;
__make_buffer_clean(b);
__unlink_buffer(b);
__free_buffer_wake(b);
- return 1;
+ return true;
}
-static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+ unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+ gfp_t gfp_mask)
{
int l;
struct dm_buffer *b, *tmp;
- long freed = 0;
+ unsigned long freed = 0;
+ unsigned long count = nr_to_scan;
+ unsigned retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- freed += __cleanup_old_buffer(b, gfp_mask, 0);
- if (!--nr_to_scan)
+ if (__try_evict_buffer(b, gfp_mask))
+ freed++;
+ if (!--nr_to_scan || ((count - freed) <= retain_target))
return freed;
dm_bufio_cond_resched();
}
@@ -1486,7 +1540,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return SHRINK_STOP;
@@ -1503,7 +1557,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return 0;
@@ -1533,11 +1587,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
r = -ENOMEM;
goto bad_client;
}
- c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
- if (!c->cache_hash) {
- r = -ENOMEM;
- goto bad_hash;
- }
+ c->buffer_tree = RB_ROOT;
c->bdev = bdev;
c->block_size = block_size;
@@ -1556,9 +1606,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->n_buffers[i] = 0;
}
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- INIT_HLIST_HEAD(&c->cache_hash[i]);
-
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->reserved_buffers);
c->need_reserved_buffers = reserved_buffers;
@@ -1632,8 +1679,6 @@ bad_cache:
}
dm_io_client_destroy(c->dm_io);
bad_dm_io:
- vfree(c->cache_hash);
-bad_hash:
kfree(c);
bad_client:
return ERR_PTR(r);
@@ -1660,9 +1705,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
mutex_unlock(&dm_bufio_clients_lock);
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+ BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
BUG_ON(c->need_reserved_buffers);
while (!list_empty(&c->reserved_buffers)) {
@@ -1680,36 +1723,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
BUG_ON(c->n_buffers[i]);
dm_io_client_destroy(c->dm_io);
- vfree(c->cache_hash);
kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
{
- unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
- struct dm_bufio_client *c;
+ unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
- if (max_age > ULONG_MAX / HZ)
- max_age = ULONG_MAX / HZ;
+ if (max_age > UINT_MAX / HZ)
+ max_age = UINT_MAX / HZ;
- mutex_lock(&dm_bufio_clients_lock);
- list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
- if (!dm_bufio_trylock(c))
- continue;
+ return max_age * HZ;
+}
- while (!list_empty(&c->lru[LIST_CLEAN])) {
- struct dm_buffer *b;
- b = list_entry(c->lru[LIST_CLEAN].prev,
- struct dm_buffer, lru_list);
- if (!__cleanup_old_buffer(b, 0, max_age * HZ))
- break;
- dm_bufio_cond_resched();
- }
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+ return (jiffies - b->last_accessed) >= age_hz;
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+ struct dm_buffer *b, *tmp;
+ unsigned retain_target = get_retain_buffers(c);
+ unsigned count;
+
+ dm_bufio_lock(c);
+
+ count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+ if (count <= retain_target)
+ break;
+
+ if (!older_than(b, age_hz))
+ break;
+
+ if (__try_evict_buffer(b, 0))
+ count--;
- dm_bufio_unlock(c);
dm_bufio_cond_resched();
}
+
+ dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+ unsigned long max_age_hz = get_max_age_hz();
+ struct dm_bufio_client *c;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+ __evict_old_buffers(c, max_age_hz);
+
mutex_unlock(&dm_bufio_clients_lock);
}
@@ -1834,6 +1901,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index aac0e2df06be..bed4ad4e1b7c 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,6 +19,7 @@
typedef dm_block_t __bitwise__ dm_oblock_t;
typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
static inline dm_oblock_t to_oblock(dm_block_t b)
{
@@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b)
return (__force uint32_t) b;
}
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+ return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 06709257adde..9fc616c2755e 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
dm_block_t discard_root;
sector_t discard_block_size;
- dm_oblock_t discard_nr_blocks;
+ dm_dblock_t discard_nr_blocks;
sector_t data_block_size;
dm_cblock_t cache_blocks;
@@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
- disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
@@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
cmd->discard_root = le64_to_cpu(disk_super->discard_root);
cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
- cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
+ cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
- disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -797,15 +797,15 @@ out:
int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
sector_t discard_block_size,
- dm_oblock_t new_nr_entries)
+ dm_dblock_t new_nr_entries)
{
int r;
down_write(&cmd->root_lock);
r = dm_bitset_resize(&cmd->discard_info,
cmd->discard_root,
- from_oblock(cmd->discard_nr_blocks),
- from_oblock(new_nr_entries),
+ from_dblock(cmd->discard_nr_blocks),
+ from_dblock(new_nr_entries),
false, &cmd->discard_root);
if (!r) {
cmd->discard_block_size = discard_block_size;
@@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
return r;
}
-static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
{
return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
- from_oblock(b), &cmd->discard_root);
+ from_dblock(b), &cmd->discard_root);
}
-static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
{
return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
- from_oblock(b), &cmd->discard_root);
+ from_dblock(b), &cmd->discard_root);
}
-static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b,
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
bool *is_discarded)
{
return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
- from_oblock(b), &cmd->discard_root,
+ from_dblock(b), &cmd->discard_root,
is_discarded);
}
static int __discard(struct dm_cache_metadata *cmd,
- dm_oblock_t dblock, bool discard)
+ dm_dblock_t dblock, bool discard)
{
int r;
@@ -852,7 +852,7 @@ static int __discard(struct dm_cache_metadata *cmd,
}
int dm_cache_set_discard(struct dm_cache_metadata *cmd,
- dm_oblock_t dblock, bool discard)
+ dm_dblock_t dblock, bool discard)
{
int r;
@@ -870,8 +870,8 @@ static int __load_discards(struct dm_cache_metadata *cmd,
dm_block_t b;
bool discard;
- for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) {
- dm_oblock_t dblock = to_oblock(b);
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ dm_dblock_t dblock = to_dblock(b);
if (cmd->clean_when_opened) {
r = __is_discarded(cmd, dblock, &discard);
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 7383c90ccdb8..4ecc403be283 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -70,14 +70,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
sector_t discard_block_size,
- dm_oblock_t new_nr_entries);
+ dm_dblock_t new_nr_entries);
typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
- dm_oblock_t dblock, bool discarded);
+ dm_dblock_t dblock, bool discarded);
int dm_cache_load_discards(struct dm_cache_metadata *cmd,
load_discard_fn fn, void *context);
-int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard);
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 0e385e40909e..13f547a4eeb6 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -181,24 +181,30 @@ static void queue_shift_down(struct queue *q)
* Gives us the oldest entry of the lowest popoulated level. If the first
* level is emptied then we shift down one level.
*/
-static struct list_head *queue_pop(struct queue *q)
+static struct list_head *queue_peek(struct queue *q)
{
unsigned level;
- struct list_head *r;
for (level = 0; level < NR_QUEUE_LEVELS; level++)
- if (!list_empty(q->qs + level)) {
- r = q->qs[level].next;
- list_del(r);
+ if (!list_empty(q->qs + level))
+ return q->qs[level].next;
- /* have we just emptied the bottom level? */
- if (level == 0 && list_empty(q->qs))
- queue_shift_down(q);
+ return NULL;
+}
- return r;
- }
+static struct list_head *queue_pop(struct queue *q)
+{
+ struct list_head *r = queue_peek(q);
- return NULL;
+ if (r) {
+ list_del(r);
+
+ /* have we just emptied the bottom level? */
+ if (list_empty(q->qs))
+ queue_shift_down(q);
+ }
+
+ return r;
}
static struct list_head *list_pop(struct list_head *lh)
@@ -383,13 +389,6 @@ struct mq_policy {
unsigned generation;
unsigned generation_period; /* in lookups (will probably change) */
- /*
- * Entries in the pre_cache whose hit count passes the promotion
- * threshold move to the cache proper. Working out the correct
- * value for the promotion_threshold is crucial to this policy.
- */
- unsigned promote_threshold;
-
unsigned discard_promote_adjustment;
unsigned read_promote_adjustment;
unsigned write_promote_adjustment;
@@ -406,6 +405,7 @@ struct mq_policy {
#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
+#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
/*----------------------------------------------------------------*/
@@ -518,6 +518,12 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
return e;
}
+static struct entry *peek(struct queue *q)
+{
+ struct list_head *h = queue_peek(q);
+ return h ? container_of(h, struct entry, list) : NULL;
+}
+
/*
* Has this entry already been updated?
*/
@@ -570,10 +576,6 @@ static void check_generation(struct mq_policy *mq)
break;
}
}
-
- mq->promote_threshold = nr ? total / nr : 1;
- if (mq->promote_threshold * nr < total)
- mq->promote_threshold++;
}
}
@@ -641,6 +643,30 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
}
/*
+ * Entries in the pre_cache whose hit count passes the promotion
+ * threshold move to the cache proper. Working out the correct
+ * value for the promotion_threshold is crucial to this policy.
+ */
+static unsigned promote_threshold(struct mq_policy *mq)
+{
+ struct entry *e;
+
+ if (any_free_cblocks(mq))
+ return 0;
+
+ e = peek(&mq->cache_clean);
+ if (e)
+ return e->hit_count;
+
+ e = peek(&mq->cache_dirty);
+ if (e)
+ return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
+
+ /* This should never happen */
+ return 0;
+}
+
+/*
* We modify the basic promotion_threshold depending on the specific io.
*
* If the origin block has been discarded then there's no cost to copy it
@@ -653,7 +679,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
bool discarded_oblock, int data_dir)
{
if (data_dir == READ)
- return mq->promote_threshold + mq->read_promote_adjustment;
+ return promote_threshold(mq) + mq->read_promote_adjustment;
if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
/*
@@ -663,7 +689,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
return mq->discard_promote_adjustment;
}
- return mq->promote_threshold + mq->write_promote_adjustment;
+ return promote_threshold(mq) + mq->write_promote_adjustment;
}
static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -839,7 +865,8 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
if (e && in_cache(mq, e))
r = cache_entry_found(mq, e, result);
- else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
+ else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
+ iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
result->op = POLICY_MISS;
else if (e)
@@ -1230,7 +1257,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
mq->tick = 0;
mq->hit_count = 0;
mq->generation = 0;
- mq->promote_threshold = 0;
mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
@@ -1265,7 +1291,7 @@ bad_pre_cache_init:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1273,7 +1299,7 @@ static struct dm_cache_policy_type mq_policy_type = {
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create,
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..1e96d7889f51 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
/*----------------------------------------------------------------*/
-#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10
@@ -237,8 +236,9 @@ struct cache {
/*
* origin_blocks entries, discarded if set.
*/
- dm_oblock_t discard_nr_blocks;
+ dm_dblock_t discard_nr_blocks;
unsigned long *discard_bitset;
+ uint32_t discard_block_size; /* a power of 2 times sectors per block */
/*
* Rather than reconstructing the table line for the status we just
@@ -310,6 +310,7 @@ struct dm_cache_migration {
dm_cblock_t cblock;
bool err:1;
+ bool discard:1;
bool writeback:1;
bool demote:1;
bool promote:1;
@@ -433,11 +434,12 @@ static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cel
/*----------------------------------------------------------------*/
-static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
key->virtual = 0;
key->dev = 0;
- key->block = from_oblock(oblock);
+ key->block_begin = from_oblock(begin);
+ key->block_end = from_oblock(end);
}
/*
@@ -447,15 +449,15 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
*/
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
-static int bio_detain(struct cache *cache, dm_oblock_t oblock,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
+ struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+ cell_free_fn free_fn, void *free_context,
+ struct dm_bio_prison_cell **cell_result)
{
int r;
struct dm_cell_key key;
- build_key(oblock, &key);
+ build_key(oblock_begin, oblock_end, &key);
r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
if (r)
free_fn(free_context, cell_prealloc);
@@ -463,6 +465,16 @@ static int bio_detain(struct cache *cache, dm_oblock_t oblock,
return r;
}
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+ struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+ cell_free_fn free_fn, void *free_context,
+ struct dm_bio_prison_cell **cell_result)
+{
+ dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
+ return bio_detain_range(cache, oblock, end, bio,
+ cell_prealloc, free_fn, free_context, cell_result);
+}
+
static int get_cell(struct cache *cache,
dm_oblock_t oblock,
struct prealloc *structs,
@@ -474,7 +486,7 @@ static int get_cell(struct cache *cache,
cell_prealloc = prealloc_get_cell(structs);
- build_key(oblock, &key);
+ build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
if (r)
prealloc_put_cell(structs, cell_prealloc);
@@ -524,33 +536,57 @@ static dm_block_t block_div(dm_block_t b, uint32_t n)
return b;
}
-static void set_discard(struct cache *cache, dm_oblock_t b)
+static dm_block_t oblocks_per_dblock(struct cache *cache)
+{
+ dm_block_t oblocks = cache->discard_block_size;
+
+ if (block_size_is_power_of_two(cache))
+ oblocks >>= cache->sectors_per_block_shift;
+ else
+ oblocks = block_div(oblocks, cache->sectors_per_block);
+
+ return oblocks;
+}
+
+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
+{
+ return to_dblock(block_div(from_oblock(oblock),
+ oblocks_per_dblock(cache)));
+}
+
+static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
+{
+ return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
+}
+
+static void set_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
+ BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
atomic_inc(&cache->stats.discard_count);
spin_lock_irqsave(&cache->lock, flags);
- set_bit(from_oblock(b), cache->discard_bitset);
+ set_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
}
-static void clear_discard(struct cache *cache, dm_oblock_t b)
+static void clear_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- clear_bit(from_oblock(b), cache->discard_bitset);
+ clear_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
}
-static bool is_discarded(struct cache *cache, dm_oblock_t b)
+static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
int r;
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- r = test_bit(from_oblock(b), cache->discard_bitset);
+ r = test_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
return r;
@@ -562,7 +598,8 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- r = test_bit(from_oblock(b), cache->discard_bitset);
+ r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
+ cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
return r;
@@ -687,7 +724,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
- clear_discard(cache, oblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
}
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -697,7 +734,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
set_dirty(cache, oblock, cblock);
- clear_discard(cache, oblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
}
}
@@ -951,10 +988,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
}
} else {
- clear_dirty(cache, mg->new_oblock, mg->cblock);
- if (mg->requeue_holder)
+ if (mg->requeue_holder) {
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
cell_defer(cache, mg->new_ocell, true);
- else {
+ } else {
+ /*
+ * The block was promoted via an overwrite, so it's dirty.
+ */
+ set_dirty(cache, mg->new_oblock, mg->cblock);
bio_endio(mg->new_ocell->holder, 0);
cell_defer(cache, mg->new_ocell, false);
}
@@ -978,7 +1019,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
wake_worker(cache);
}
-static void issue_copy_real(struct dm_cache_migration *mg)
+static void issue_copy(struct dm_cache_migration *mg)
{
int r;
struct dm_io_region o_region, c_region;
@@ -1057,11 +1098,46 @@ static void avoid_copy(struct dm_cache_migration *mg)
migration_success_pre_commit(mg);
}
-static void issue_copy(struct dm_cache_migration *mg)
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+ dm_dblock_t *b, dm_dblock_t *e)
+{
+ sector_t sb = bio->bi_iter.bi_sector;
+ sector_t se = bio_end_sector(bio);
+
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
+
+ if (se - sb < cache->discard_block_size)
+ *e = *b;
+ else
+ *e = to_dblock(block_div(se, cache->discard_block_size));
+}
+
+static void issue_discard(struct dm_cache_migration *mg)
+{
+ dm_dblock_t b, e;
+ struct bio *bio = mg->new_ocell->holder;
+
+ calc_discard_block_range(mg->cache, bio, &b, &e);
+ while (b != e) {
+ set_discard(mg->cache, b);
+ b = to_dblock(from_dblock(b) + 1);
+ }
+
+ bio_endio(bio, 0);
+ cell_defer(mg->cache, mg->new_ocell, false);
+ free_migration(mg);
+}
+
+static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
bool avoid;
struct cache *cache = mg->cache;
+ if (mg->discard) {
+ issue_discard(mg);
+ return;
+ }
+
if (mg->writeback || mg->demote)
avoid = !is_dirty(cache, mg->cblock) ||
is_discarded_oblock(cache, mg->old_oblock);
@@ -1070,13 +1146,14 @@ static void issue_copy(struct dm_cache_migration *mg)
avoid = is_discarded_oblock(cache, mg->new_oblock);
- if (!avoid && bio_writes_complete_block(cache, bio)) {
+ if (writeback_mode(&cache->features) &&
+ !avoid && bio_writes_complete_block(cache, bio)) {
issue_overwrite(mg, bio);
return;
}
}
- avoid ? avoid_copy(mg) : issue_copy_real(mg);
+ avoid ? avoid_copy(mg) : issue_copy(mg);
}
static void complete_migration(struct dm_cache_migration *mg)
@@ -1161,6 +1238,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
struct dm_cache_migration *mg = prealloc_get_migration(structs);
mg->err = false;
+ mg->discard = false;
mg->writeback = false;
mg->demote = false;
mg->promote = true;
@@ -1184,6 +1262,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
struct dm_cache_migration *mg = prealloc_get_migration(structs);
mg->err = false;
+ mg->discard = false;
mg->writeback = true;
mg->demote = false;
mg->promote = false;
@@ -1209,6 +1288,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
struct dm_cache_migration *mg = prealloc_get_migration(structs);
mg->err = false;
+ mg->discard = false;
mg->writeback = false;
mg->demote = true;
mg->promote = true;
@@ -1237,6 +1317,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
struct dm_cache_migration *mg = prealloc_get_migration(structs);
mg->err = false;
+ mg->discard = false;
mg->writeback = false;
mg->demote = true;
mg->promote = false;
@@ -1253,6 +1334,26 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
quiesce_migration(mg);
}
+static void discard(struct cache *cache, struct prealloc *structs,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = true;
+ mg->writeback = false;
+ mg->demote = false;
+ mg->promote = false;
+ mg->requeue_holder = false;
+ mg->invalidate = false;
+ mg->cache = cache;
+ mg->old_ocell = NULL;
+ mg->new_ocell = cell;
+ mg->start_jiffies = jiffies;
+
+ quiesce_migration(mg);
+}
+
/*----------------------------------------------------------------
* bio processing
*--------------------------------------------------------------*/
@@ -1286,31 +1387,27 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
issue(cache, bio);
}
-/*
- * People generally discard large parts of a device, eg, the whole device
- * when formatting. Splitting these large discards up into cache block
- * sized ios and then quiescing (always neccessary for discard) takes too
- * long.
- *
- * We keep it simple, and allow any size of discard to come in, and just
- * mark off blocks on the discard bitset. No passdown occurs!
- *
- * To implement passdown we need to change the bio_prison such that a cell
- * can have a key that spans many blocks.
- */
-static void process_discard_bio(struct cache *cache, struct bio *bio)
+static void process_discard_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
{
- dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
- cache->sectors_per_block);
- dm_block_t end_block = bio_end_sector(bio);
- dm_block_t b;
+ int r;
+ dm_dblock_t b, e;
+ struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
- end_block = block_div(end_block, cache->sectors_per_block);
+ calc_discard_block_range(cache, bio, &b, &e);
+ if (b == e) {
+ bio_endio(bio, 0);
+ return;
+ }
- for (b = start_block; b < end_block; b++)
- set_discard(cache, to_oblock(b));
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
- bio_endio(bio, 0);
+ discard(cache, structs, new_ocell);
}
static bool spare_migration_bandwidth(struct cache *cache)
@@ -1340,9 +1437,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
dm_oblock_t block = get_bio_block(cache, bio);
struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
- bool discarded_block = is_discarded_oblock(cache, block);
bool passthrough = passthrough_mode(&cache->features);
- bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+ bool discarded_block, can_migrate;
/*
* Check to see if that block is currently migrating.
@@ -1354,6 +1450,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
if (r > 0)
return;
+ discarded_block = is_discarded_oblock(cache, block);
+ can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+
r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
bio, &lookup_result);
@@ -1500,7 +1599,7 @@ static void process_deferred_bios(struct cache *cache)
if (bio->bi_rw & REQ_FLUSH)
process_flush_bio(cache, bio);
else if (bio->bi_rw & REQ_DISCARD)
- process_discard_bio(cache, bio);
+ process_discard_bio(cache, &structs, bio);
else
process_bio(cache, &structs, bio);
}
@@ -1715,7 +1814,7 @@ static void do_worker(struct work_struct *ws)
process_invalidation_requests(cache);
}
- process_migrations(cache, &cache->quiesced_migrations, issue_copy);
+ process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
process_migrations(cache, &cache->completed_migrations, complete_migration);
if (commit_if_needed(cache)) {
@@ -2180,6 +2279,45 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
return 0;
}
+/*
+ * We want the discard block size to be at least the size of the cache
+ * block size and have no more than 2^14 discard blocks across the origin.
+ */
+#define MAX_DISCARD_BLOCKS (1 << 14)
+
+static bool too_many_discard_blocks(sector_t discard_block_size,
+ sector_t origin_size)
+{
+ (void) sector_div(origin_size, discard_block_size);
+
+ return origin_size > MAX_DISCARD_BLOCKS;
+}
+
+static sector_t calculate_discard_block_size(sector_t cache_block_size,
+ sector_t origin_size)
+{
+ sector_t discard_block_size = cache_block_size;
+
+ if (origin_size)
+ while (too_many_discard_blocks(discard_block_size, origin_size))
+ discard_block_size *= 2;
+
+ return discard_block_size;
+}
+
+static void set_cache_size(struct cache *cache, dm_cblock_t size)
+{
+ dm_block_t nr_blocks = from_cblock(size);
+
+ if (nr_blocks > (1 << 20) && cache->cache_size != size)
+ DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
+ "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
+ "Please consider increasing the cache block size to reduce the overall cache block count.",
+ (unsigned long long) nr_blocks);
+
+ cache->cache_size = size;
+}
+
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2204,8 +2342,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
- /* Discard bios must be split on a block boundary */
- ti->split_discard_bios = true;
+ ti->split_discard_bios = false;
cache->features = ca->features;
ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2235,10 +2372,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->sectors_per_block_shift = -1;
cache_size = block_div(cache_size, ca->block_size);
- cache->cache_size = to_cblock(cache_size);
+ set_cache_size(cache, to_cblock(cache_size));
} else {
cache->sectors_per_block_shift = __ffs(ca->block_size);
- cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
+ set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
}
r = create_cache_policy(cache, ca, error);
@@ -2303,13 +2440,17 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
- cache->discard_nr_blocks = cache->origin_blocks;
- cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
+ cache->discard_block_size =
+ calculate_discard_block_size(cache->sectors_per_block,
+ cache->origin_sectors);
+ cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
+ cache->discard_block_size));
+ cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
if (!cache->discard_bitset) {
*error = "could not allocate discard bitset";
goto bad;
}
- clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
+ clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(cache->copier)) {
@@ -2327,7 +2468,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
INIT_DELAYED_WORK(&cache->waker, do_waker);
cache->last_commit_jiffies = jiffies;
- cache->prison = dm_bio_prison_create(PRISON_CELLS);
+ cache->prison = dm_bio_prison_create();
if (!cache->prison) {
*error = "could not create bio prison";
goto bad;
@@ -2549,11 +2690,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
static int cache_map(struct dm_target *ti, struct bio *bio)
{
int r;
- struct dm_bio_prison_cell *cell;
+ struct dm_bio_prison_cell *cell = NULL;
struct cache *cache = ti->private;
r = __cache_map(cache, bio, &cell);
- if (r == DM_MAPIO_REMAPPED) {
+ if (r == DM_MAPIO_REMAPPED && cell) {
inc_ds(cache, bio, cell);
cell_defer(cache, cell, false);
}
@@ -2599,16 +2740,16 @@ static int write_discard_bitset(struct cache *cache)
{
unsigned i, r;
- r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
- cache->origin_blocks);
+ r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
+ cache->discard_nr_blocks);
if (r) {
DMERR("could not resize on-disk discard bitset");
return r;
}
- for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
- r = dm_cache_set_discard(cache->cmd, to_oblock(i),
- is_discarded(cache, to_oblock(i)));
+ for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
+ r = dm_cache_set_discard(cache->cmd, to_dblock(i),
+ is_discarded(cache, to_dblock(i)));
if (r)
return r;
}
@@ -2680,15 +2821,86 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
return 0;
}
+/*
+ * The discard block size in the on disk metadata is not
+ * neccessarily the same as we're currently using. So we have to
+ * be careful to only set the discarded attribute if we know it
+ * covers a complete block of the new size.
+ */
+struct discard_load_info {
+ struct cache *cache;
+
+ /*
+ * These blocks are sized using the on disk dblock size, rather
+ * than the current one.
+ */
+ dm_block_t block_size;
+ dm_block_t discard_begin, discard_end;
+};
+
+static void discard_load_info_init(struct cache *cache,
+ struct discard_load_info *li)
+{
+ li->cache = cache;
+ li->discard_begin = li->discard_end = 0;
+}
+
+static void set_discard_range(struct discard_load_info *li)
+{
+ sector_t b, e;
+
+ if (li->discard_begin == li->discard_end)
+ return;
+
+ /*
+ * Convert to sectors.
+ */
+ b = li->discard_begin * li->block_size;
+ e = li->discard_end * li->block_size;
+
+ /*
+ * Then convert back to the current dblock size.
+ */
+ b = dm_sector_div_up(b, li->cache->discard_block_size);
+ sector_div(e, li->cache->discard_block_size);
+
+ /*
+ * The origin may have shrunk, so we need to check we're still in
+ * bounds.
+ */
+ if (e > from_dblock(li->cache->discard_nr_blocks))
+ e = from_dblock(li->cache->discard_nr_blocks);
+
+ for (; b < e; b++)
+ set_discard(li->cache, to_dblock(b));
+}
+
static int load_discard(void *context, sector_t discard_block_size,
- dm_oblock_t oblock, bool discard)
+ dm_dblock_t dblock, bool discard)
{
- struct cache *cache = context;
+ struct discard_load_info *li = context;
- if (discard)
- set_discard(cache, oblock);
- else
- clear_discard(cache, oblock);
+ li->block_size = discard_block_size;
+
+ if (discard) {
+ if (from_dblock(dblock) == li->discard_end)
+ /*
+ * We're already in a discard range, just extend it.
+ */
+ li->discard_end = li->discard_end + 1ULL;
+
+ else {
+ /*
+ * Emit the old range and start a new one.
+ */
+ set_discard_range(li);
+ li->discard_begin = from_dblock(dblock);
+ li->discard_end = li->discard_begin + 1ULL;
+ }
+ } else {
+ set_discard_range(li);
+ li->discard_begin = li->discard_end = 0;
+ }
return 0;
}
@@ -2730,7 +2942,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
return r;
}
- cache->cache_size = new_size;
+ set_cache_size(cache, new_size);
return 0;
}
@@ -2772,11 +2984,22 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_discards) {
- r = dm_cache_load_discards(cache->cmd, load_discard, cache);
+ struct discard_load_info li;
+
+ /*
+ * The discard bitset could have been resized, or the
+ * discard block size changed. To be safe we start by
+ * setting every dblock to not discarded.
+ */
+ clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+
+ discard_load_info_init(cache, &li);
+ r = dm_cache_load_discards(cache->cmd, load_discard, &li);
if (r) {
DMERR("could not load origin discards");
return r;
}
+ set_discard_range(&li);
cache->loaded_discards = true;
}
@@ -3079,8 +3302,9 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
/*
* FIXME: these limits may be incompatible with the cache device
*/
- limits->max_discard_sectors = cache->sectors_per_block;
- limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
+ limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
+ cache->origin_sectors);
+ limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}
static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3104,7 +3328,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 5, 0},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fc93b9330af4..08981be7baa1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -705,7 +705,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
crypto_xor(data + i * 8, buf, 8);
out:
- memset(buf, 0, sizeof(buf));
+ memzero_explicit(buf, sizeof(buf));
return r;
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0be9381365d7..73f791bb9ea4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
int srcu_idx;
param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
- DM_ACTIVE_PRESENT_FLAG);
+ DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG);
if (dm_suspended_md(md))
param->flags |= DM_SUSPEND_FLAG;
+ if (dm_suspended_internally_md(md))
+ param->flags |= DM_INTERNAL_SUSPEND_FLAG;
+
if (dm_test_deferred_remove_flag(md))
param->flags |= DM_DEFERRED_REMOVE;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4857fa4a5484..07c0fa0fa284 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -789,8 +789,7 @@ struct dm_raid_superblock {
__le32 layout;
__le32 stripe_sectors;
- __u8 pad[452]; /* Round struct to 512 bytes. */
- /* Always set to 0 when writing. */
+ /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
} __packed;
static int read_disk_sb(struct md_rdev *rdev, int size)
@@ -827,7 +826,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
test_bit(Faulty, &(rs->dev[i].rdev.flags)))
failed_devices |= (1ULL << i);
- memset(sb, 0, sizeof(*sb));
+ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
sb->magic = cpu_to_le32(DM_RAID_MAGIC);
sb->features = cpu_to_le32(0); /* No features yet */
@@ -862,7 +861,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
uint64_t events_sb, events_refsb;
rdev->sb_start = 0;
- rdev->sb_size = sizeof(*sb);
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
ret = read_disk_sb(rdev, rdev->sb_size);
if (ret)
@@ -1169,8 +1172,12 @@ static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
for (i = 0; i < rs->md.raid_disks; i++) {
- struct request_queue *q = bdev_get_queue(rs->dev[i].rdev.bdev);
+ struct request_queue *q;
+
+ if (!rs->dev[i].rdev.bdev)
+ continue;
+ q = bdev_get_queue(rs->dev[i].rdev.bdev);
if (!q || !blk_queue_discard(q))
return;
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 87f86c77b094..f478a4c96d2f 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md,
return 1;
id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
- dm_internal_suspend, dm_internal_resume, md);
+ dm_internal_suspend_fast, dm_internal_resume_fast, md);
if (id < 0)
return id;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d1600d2aa2e2..f8b37d4c05d8 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -159,8 +159,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sc->stripes_shift = __ffs(stripes);
r = dm_set_target_max_io_len(ti, chunk_size);
- if (r)
+ if (r) {
+ kfree(sc);
return r;
+ }
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index b2bd1ebf4562..3afae9e062f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_get_mode);
-static void suspend_targets(struct dm_table *t, unsigned postsuspend)
+enum suspend_mode {
+ PRESUSPEND,
+ PRESUSPEND_UNDO,
+ POSTSUSPEND,
+};
+
+static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
{
int i = t->num_targets;
struct dm_target *ti = t->targets;
while (i--) {
- if (postsuspend) {
+ switch (mode) {
+ case PRESUSPEND:
+ if (ti->type->presuspend)
+ ti->type->presuspend(ti);
+ break;
+ case PRESUSPEND_UNDO:
+ if (ti->type->presuspend_undo)
+ ti->type->presuspend_undo(ti);
+ break;
+ case POSTSUSPEND:
if (ti->type->postsuspend)
ti->type->postsuspend(ti);
- } else if (ti->type->presuspend)
- ti->type->presuspend(ti);
-
+ break;
+ }
ti++;
}
}
@@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t)
if (!t)
return;
- suspend_targets(t, 0);
+ suspend_targets(t, PRESUSPEND);
+}
+
+void dm_table_presuspend_undo_targets(struct dm_table *t)
+{
+ if (!t)
+ return;
+
+ suspend_targets(t, PRESUSPEND_UNDO);
}
void dm_table_postsuspend_targets(struct dm_table *t)
@@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t)
if (!t)
return;
- suspend_targets(t, 1);
+ suspend_targets(t, POSTSUSPEND);
}
int dm_table_resume_targets(struct dm_table *t)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e9d33ad59df5..43adbb863f5a 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
}
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_block, struct dm_thin_lookup_result *result)
+ int can_issue_io, struct dm_thin_lookup_result *result)
{
- int r = -EINVAL;
- uint64_t block_time = 0;
+ int r;
__le64 value;
struct dm_pool_metadata *pmd = td->pmd;
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- if (can_block) {
- down_read(&pmd->root_lock);
- info = &pmd->info;
- } else if (down_read_trylock(&pmd->root_lock))
- info = &pmd->nb_info;
- else
- return -EWOULDBLOCK;
-
if (pmd->fail_io)
- goto out;
+ return -EINVAL;
- r = dm_btree_lookup(info, pmd->root, keys, &value);
- if (!r)
- block_time = le64_to_cpu(value);
+ down_read(&pmd->root_lock);
-out:
- up_read(&pmd->root_lock);
+ if (can_issue_io) {
+ info = &pmd->info;
+ } else
+ info = &pmd->nb_info;
+ r = dm_btree_lookup(info, pmd->root, keys, &value);
if (!r) {
+ uint64_t block_time = 0;
dm_block_t exception_block;
uint32_t exception_time;
+
+ block_time = le64_to_cpu(value);
unpack_block_time(block_time, &exception_block,
&exception_time);
result->block = exception_block;
result->shared = __snapshotted_since(td, exception_time);
}
+ up_read(&pmd->root_lock);
return r;
}
@@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
return needs_check;
}
+
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
+{
+ dm_tm_issue_prefetches(pmd->tm);
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index e3c857db195a..921d15ee56a0 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -139,12 +139,12 @@ struct dm_thin_lookup_result {
/*
* Returns:
- * -EWOULDBLOCK iff @can_block is set and would block.
+ * -EWOULDBLOCK iff @can_issue_io is set and would issue IO
* -ENODATA iff that mapping is not present.
* 0 success
*/
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_block, struct dm_thin_lookup_result *result);
+ int can_issue_io, struct dm_thin_lookup_result *result);
/*
* Obtain an unused block.
@@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+/*
+ * Issue any prefetches that may be useful.
+ */
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+
/*----------------------------------------------------------------*/
#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 4843801173fe..8735543eacdb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,11 +11,13 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/rbtree.h>
#define DM_MSG_PREFIX "thin"
@@ -25,7 +27,6 @@
*/
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
-#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60
@@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td,
{
key->virtual = 0;
key->dev = dm_thin_dev_id(td);
- key->block = b;
+ key->block_begin = b;
+ key->block_end = b + 1ULL;
}
static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
@@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
{
key->virtual = 1;
key->dev = dm_thin_dev_id(td);
- key->block = b;
+ key->block_begin = b;
+ key->block_end = b + 1ULL;
+}
+
+/*----------------------------------------------------------------*/
+
+#define THROTTLE_THRESHOLD (1 * HZ)
+
+struct throttle {
+ struct rw_semaphore lock;
+ unsigned long threshold;
+ bool throttle_applied;
+};
+
+static void throttle_init(struct throttle *t)
+{
+ init_rwsem(&t->lock);
+ t->throttle_applied = false;
+}
+
+static void throttle_work_start(struct throttle *t)
+{
+ t->threshold = jiffies + THROTTLE_THRESHOLD;
+}
+
+static void throttle_work_update(struct throttle *t)
+{
+ if (!t->throttle_applied && jiffies > t->threshold) {
+ down_write(&t->lock);
+ t->throttle_applied = true;
+ }
+}
+
+static void throttle_work_complete(struct throttle *t)
+{
+ if (t->throttle_applied) {
+ t->throttle_applied = false;
+ up_write(&t->lock);
+ }
+}
+
+static void throttle_lock(struct throttle *t)
+{
+ down_read(&t->lock);
+}
+
+static void throttle_unlock(struct throttle *t)
+{
+ up_read(&t->lock);
}
/*----------------------------------------------------------------*/
@@ -155,8 +205,11 @@ struct pool_features {
struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+#define CELL_SORT_ARRAY_SIZE 8192
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -171,11 +224,13 @@ struct pool {
struct pool_features pf;
bool low_water_triggered:1; /* A dm event has been sent */
+ bool suspended:1;
struct dm_bio_prison *prison;
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
+ struct throttle throttle;
struct work_struct worker;
struct delayed_work waker;
struct delayed_work no_space_timeout;
@@ -198,8 +253,13 @@ struct pool {
process_bio_fn process_bio;
process_bio_fn process_discard;
+ process_cell_fn process_cell;
+ process_cell_fn process_discard_cell;
+
process_mapping_fn process_prepared_mapping;
process_mapping_fn process_prepared_discard;
+
+ struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -232,8 +292,11 @@ struct thin_c {
struct pool *pool;
struct dm_thin_device *td;
+ struct mapped_device *thin_md;
+
bool requeue_mode:1;
spinlock_t lock;
+ struct list_head deferred_cells;
struct bio_list deferred_bio_list;
struct bio_list retry_on_resume_list;
struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -290,6 +353,15 @@ static void cell_release(struct pool *pool,
dm_bio_prison_free_cell(pool->prison, cell);
}
+static void cell_visit_release(struct pool *pool,
+ void (*fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ dm_cell_visit_release(pool->prison, fn, context, cell);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
static void cell_release_no_holder(struct pool *pool,
struct dm_bio_prison_cell *cell,
struct bio_list *bios)
@@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool,
dm_bio_prison_free_cell(pool->prison, cell);
}
-static void cell_defer_no_holder_no_free(struct thin_c *tc,
- struct dm_bio_prison_cell *cell)
-{
- struct pool *pool = tc->pool;
- unsigned long flags;
-
- spin_lock_irqsave(&tc->lock, flags);
- dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
-
- wake_worker(pool);
-}
-
static void cell_error_with_code(struct pool *pool,
struct dm_bio_prison_cell *cell, int error_code)
{
@@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
cell_error_with_code(pool, cell, -EIO);
}
+static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, 0);
+}
+
+static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -393,44 +462,65 @@ struct dm_thin_endio_hook {
struct rb_node rb_node;
};
-static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
+{
+ bio_list_merge(bios, master);
+ bio_list_init(master);
+}
+
+static void error_bio_list(struct bio_list *bios, int error)
{
struct bio *bio;
+
+ while ((bio = bio_list_pop(bios)))
+ bio_endio(bio, error);
+}
+
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+{
struct bio_list bios;
unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
- bio_list_merge(&bios, master);
- bio_list_init(master);
+ __merge_bio_list(&bios, master);
spin_unlock_irqrestore(&tc->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ error_bio_list(&bios, error);
}
-static void requeue_io(struct thin_c *tc)
+static void requeue_deferred_cells(struct thin_c *tc)
{
- requeue_bio_list(tc, &tc->deferred_bio_list);
- requeue_bio_list(tc, &tc->retry_on_resume_list);
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list)
+ cell_requeue(pool, cell);
}
-static void error_thin_retry_list(struct thin_c *tc)
+static void requeue_io(struct thin_c *tc)
{
- struct bio *bio;
- unsigned long flags;
struct bio_list bios;
+ unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
- bio_list_merge(&bios, &tc->retry_on_resume_list);
- bio_list_init(&tc->retry_on_resume_list);
+ __merge_bio_list(&bios, &tc->deferred_bio_list);
+ __merge_bio_list(&bios, &tc->retry_on_resume_list);
spin_unlock_irqrestore(&tc->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- bio_io_error(bio);
+ error_bio_list(&bios, DM_ENDIO_REQUEUE);
+ requeue_deferred_cells(tc);
}
static void error_retry_list(struct pool *pool)
@@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool)
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_retry_list(tc);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
rcu_read_unlock();
}
@@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err)
*/
/*
- * This sends the bios in the cell back to the deferred_bios list.
+ * This sends the bios in the cell, except the original holder, back
+ * to the deferred_bios list.
*/
-static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
struct pool *pool = tc->pool;
unsigned long flags;
spin_lock_irqsave(&tc->lock, flags);
- cell_release(pool, cell, &tc->deferred_bio_list);
+ cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
wake_worker(pool);
}
-/*
- * Same as cell_defer above, except it omits the original holder of the cell.
- */
-static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
+
+struct remap_info {
+ struct thin_c *tc;
+ struct bio_list defer_bios;
+ struct bio_list issue_bios;
+};
+
+static void __inc_remap_and_issue_cell(void *context,
+ struct dm_bio_prison_cell *cell)
{
- struct pool *pool = tc->pool;
- unsigned long flags;
+ struct remap_info *info = context;
+ struct bio *bio;
- spin_lock_irqsave(&tc->lock, flags);
- cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ inc_all_io_entry(info->tc->pool, bio);
- wake_worker(pool);
+ /*
+ * We can't issue the bios with the bio prison lock
+ * held, so we add them to a list to issue on
+ * return from this function.
+ */
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void inc_remap_and_issue_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ /*
+ * We have to be careful to inc any bios we're about to issue
+ * before the cell is released, and avoid a race with new bios
+ * being added to the cell.
+ */
+ cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(info.tc, bio, block);
}
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
@@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_no_holder(tc, m->cell);
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
bio_endio(bio, 0);
- } else
- cell_defer(tc, m->cell);
+ } else {
+ inc_all_io_entry(tc->pool, m->cell->holder);
+ remap_and_issue(tc, m->cell->holder, m->data_block);
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
+ }
out:
list_del(&m->list);
@@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
}
+static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
+ dm_block_t data_block,
+ struct dm_thin_new_mapping *m)
+{
+ struct pool *pool = tc->pool;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ h->overwrite_mapping = m;
+ m->bio = bio;
+ save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+ inc_all_io_entry(pool, bio);
+ remap_and_issue(tc, bio, data_block);
+}
+
/*
* A partial copy also needs to zero the uncopied region.
*/
@@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* If the whole block of data is being overwritten, we can issue the
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
- if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
- h->overwrite_mapping = m;
- m->bio = bio;
- save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_dest);
- } else {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_dest, m);
+ else {
struct dm_io_region from, to;
from.bdev = origin->bdev;
@@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
if (!pool->pf.zero_new_blocks)
process_prepared_mapping(m);
- else if (io_overwrites_block(pool, bio)) {
- struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
-
- h->overwrite_mapping = m;
- m->bio = bio;
- save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ else if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
- } else
+ else
ll_zero(tc, m,
data_block * pool->sectors_per_block,
(data_block + 1) * pool->sectors_per_block);
@@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
bio_list_init(&bios);
cell_release(pool, cell, &bios);
- error = should_error_unserviceable_bio(pool);
- if (error)
- while ((bio = bio_list_pop(&bios)))
- bio_endio(bio, error);
- else
- while ((bio = bio_list_pop(&bios)))
- retry_on_resume(bio);
+ while ((bio = bio_list_pop(&bios)))
+ retry_on_resume(bio);
}
-static void process_discard(struct thin_c *tc, struct bio *bio)
+static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
int r;
- unsigned long flags;
+ struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell, *cell2;
- struct dm_cell_key key, key2;
+ struct dm_bio_prison_cell *cell2;
+ struct dm_cell_key key2;
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_lookup_result lookup_result;
struct dm_thin_new_mapping *m;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
return;
+ }
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
@@ -1187,12 +1320,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
m->cell2 = cell2;
m->bio = bio;
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
- spin_lock_irqsave(&pool->lock, flags);
- list_add_tail(&m->list, &pool->prepared_discards);
- spin_unlock_irqrestore(&pool->lock, flags);
- wake_worker(pool);
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
} else {
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
@@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
}
}
+static void process_discard_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+ dm_block_t block = get_bio_block(tc, bio);
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &cell))
+ return;
+
+ process_discard_cell(tc, cell);
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct dm_cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
}
}
+static void __remap_and_issue_shared_cell(void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ struct remap_info *info = context;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if ((bio_data_dir(bio) == WRITE) ||
+ (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
+
+ h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
+ inc_all_io_entry(info->tc->pool, bio);
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void remap_and_issue_shared_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(tc, bio, block);
+}
+
static void process_shared_bio(struct thin_c *tc, struct bio *bio,
dm_block_t block,
- struct dm_thin_lookup_result *lookup_result)
+ struct dm_thin_lookup_result *lookup_result,
+ struct dm_bio_prison_cell *virt_cell)
{
- struct dm_bio_prison_cell *cell;
+ struct dm_bio_prison_cell *data_cell;
struct pool *pool = tc->pool;
struct dm_cell_key key;
@@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
* of being broken so we have nothing further to do here.
*/
build_data_key(tc->td, lookup_result->block, &key);
- if (bio_detain(pool, &key, bio, &cell))
+ if (bio_detain(pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
return;
+ }
- if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
- break_sharing(tc, bio, block, &key, lookup_result, cell);
- else {
+ if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
+ break_sharing(tc, bio, block, &key, lookup_result, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
+ } else {
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
-
remap_and_issue(tc, bio, lookup_result->block);
+
+ remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
+ remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
}
}
@@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
}
}
-static void process_bio(struct thin_c *tc, struct bio *bio)
+static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
int r;
struct pool *pool = tc->pool;
+ struct bio *bio = cell->holder;
dm_block_t block = get_bio_block(tc, bio);
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
struct dm_thin_lookup_result lookup_result;
- /*
- * If cell is already occupied, then the block is already
- * being provisioned so we have nothing further to do here.
- */
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(pool, &key, bio, &cell))
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
return;
+ }
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- if (lookup_result.shared) {
- process_shared_bio(tc, bio, block, &lookup_result);
- cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
- } else {
+ if (lookup_result.shared)
+ process_shared_bio(tc, bio, block, &lookup_result, cell);
+ else {
inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
-
remap_and_issue(tc, bio, lookup_result.block);
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
}
break;
@@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
}
}
-static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+static void process_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+
+ /*
+ * If cell is already occupied, then the block is already
+ * being provisioned so we have nothing further to do here.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(pool, &key, bio, &cell))
+ return;
+
+ process_cell(tc, cell);
+}
+
+static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
+ struct dm_bio_prison_cell *cell)
{
int r;
int rw = bio_data_dir(bio);
@@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
+ if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
handle_unserviceable_bio(tc->pool, bio);
- else {
+ if (cell)
+ cell_defer_no_holder(tc, cell);
+ } else {
inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block);
+ if (cell)
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
}
break;
case -ENODATA:
+ if (cell)
+ cell_defer_no_holder(tc, cell);
if (rw != READ) {
handle_unserviceable_bio(tc->pool, bio);
break;
@@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
default:
DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
__func__, r);
+ if (cell)
+ cell_defer_no_holder(tc, cell);
bio_io_error(bio);
break;
}
}
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+ __process_bio_read_only(tc, bio, NULL);
+}
+
+static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ __process_bio_read_only(tc, cell->holder, cell);
+}
+
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
bio_endio(bio, 0);
@@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
bio_io_error(bio);
}
+static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_success(tc->pool, cell);
+}
+
+static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_error(tc->pool, cell);
+}
+
/*
* FIXME: should we also commit due to size of transaction, measured in
* metadata blocks?
@@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
+ unsigned count = 0;
if (tc->requeue_mode) {
- requeue_bio_list(tc, &tc->deferred_bio_list);
+ error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
return;
}
@@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc)
pool->process_discard(tc, bio);
else
pool->process_bio(tc, bio);
+
+ if ((count++ & 127) == 0) {
+ throttle_work_update(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ }
}
blk_finish_plug(&plug);
}
+static int cmp_cells(const void *lhs, const void *rhs)
+{
+ struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
+ struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
+
+ BUG_ON(!lhs_cell->holder);
+ BUG_ON(!rhs_cell->holder);
+
+ if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
+ return -1;
+
+ if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
+ return 1;
+
+ return 0;
+}
+
+static unsigned sort_cells(struct pool *pool, struct list_head *cells)
+{
+ unsigned count = 0;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ list_for_each_entry_safe(cell, tmp, cells, user_list) {
+ if (count >= CELL_SORT_ARRAY_SIZE)
+ break;
+
+ pool->cell_sort_array[count++] = cell;
+ list_del(&cell->user_list);
+ }
+
+ sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
+
+ return count;
+}
+
+static void process_thin_deferred_cells(struct thin_c *tc)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell;
+ unsigned i, j, count;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ if (list_empty(&cells))
+ return;
+
+ do {
+ count = sort_cells(tc->pool, &cells);
+
+ for (i = 0; i < count; i++) {
+ cell = pool->cell_sort_array[i];
+ BUG_ON(!cell->holder);
+
+ /*
+ * If we've got no free new_mapping structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (ensure_next_mapping(pool)) {
+ for (j = i; j < count; j++)
+ list_add(&pool->cell_sort_array[j]->user_list, &cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice(&cells, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ return;
+ }
+
+ if (cell->holder->bi_rw & REQ_DISCARD)
+ pool->process_discard_cell(tc, cell);
+ else
+ pool->process_cell(tc, cell);
+ }
+ } while (!list_empty(&cells));
+}
+
static void thin_get(struct thin_c *tc);
static void thin_put(struct thin_c *tc);
@@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool)
tc = get_first_thin(pool);
while (tc) {
+ process_thin_deferred_cells(tc);
process_thin_deferred_bios(tc);
tc = get_next_thin(pool, tc);
}
@@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
+ throttle_work_start(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ throttle_work_update(&pool->throttle);
process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+ throttle_work_update(&pool->throttle);
process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
+ throttle_work_update(&pool->throttle);
process_deferred_bios(pool);
+ throttle_work_complete(&pool->throttle);
}
/*
@@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
+ pool->process_cell = process_cell_fail;
+ pool->process_discard_cell = process_cell_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_fail;
@@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_bio_success;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_passdown;
@@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "out-of-data-space");
pool->process_bio = process_bio_read_only;
- pool->process_discard = process_discard;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard_passdown;
@@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
notify_of_pool_mode_change(pool, "write");
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
- pool->process_discard = process_discard;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell;
+ pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard;
break;
@@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ thin_defer_bio(tc, bio);
+ throttle_unlock(&pool->throttle);
+}
+
+static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ spin_lock_irqsave(&tc->lock, flags);
+ list_add_tail(&cell->user_list, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ throttle_unlock(&pool->throttle);
+
+ wake_worker(pool);
+}
+
static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- struct dm_bio_prison_cell cell1, cell2;
- struct dm_bio_prison_cell *cell_result;
+ struct dm_bio_prison_cell *virt_cell, *data_cell;
struct dm_cell_key key;
thin_hook_bio(tc, bio);
@@ -1932,10 +2287,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
}
if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
- thin_defer_bio(tc, bio);
+ thin_defer_bio_with_throttle(tc, bio);
return DM_MAPIO_SUBMITTED;
}
+ /*
+ * We must hold the virtual cell before doing the lookup, otherwise
+ * there's a race with discard.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &virt_cell))
+ return DM_MAPIO_SUBMITTED;
+
r = dm_thin_find_block(td, block, 0, &result);
/*
@@ -1958,23 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* More distant ancestors are irrelevant. The
* shared flag will be set in their case.
*/
- thin_defer_bio(tc, bio);
+ thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
- build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
- return DM_MAPIO_SUBMITTED;
-
build_data_key(tc->td, result.block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
- cell_defer_no_holder_no_free(tc, &cell1);
+ if (bio_detain(tc->pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
inc_all_io_entry(tc->pool, bio);
- cell_defer_no_holder_no_free(tc, &cell2);
- cell_defer_no_holder_no_free(tc, &cell1);
+ cell_defer_no_holder(tc, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
remap(tc, bio, result.block);
return DM_MAPIO_REMAPPED;
@@ -1986,16 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* of doing so.
*/
handle_unserviceable_bio(tc->pool, bio);
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
/* fall through */
case -EWOULDBLOCK:
- /*
- * In future, the failed dm_thin_find_block above could
- * provide the hint to load the metadata into cache.
- */
- thin_defer_bio(tc, bio);
+ thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
default:
@@ -2005,6 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* pool is switched to fail-io mode.
*/
bio_io_error(bio);
+ cell_defer_no_holder(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
}
}
@@ -2185,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->sectors_per_block_shift = __ffs(block_size);
pool->low_water_blocks = 0;
pool_features_init(&pool->pf);
- pool->prison = dm_bio_prison_create(PRISON_CELLS);
+ pool->prison = dm_bio_prison_create();
if (!pool->prison) {
*error = "Error creating pool's bio prison";
err_p = ERR_PTR(-ENOMEM);
@@ -2211,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_wq;
}
+ throttle_init(&pool->throttle);
INIT_WORK(&pool->worker, do_worker);
INIT_DELAYED_WORK(&pool->waker, do_waker);
INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
@@ -2220,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_LIST_HEAD(&pool->prepared_discards);
INIT_LIST_HEAD(&pool->active_thins);
pool->low_water_triggered = false;
+ pool->suspended = true;
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -2756,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti)
return 0;
}
+static void pool_suspend_active_thins(struct pool *pool)
+{
+ struct thin_c *tc;
+
+ /* Suspend all active thin devices */
+ tc = get_first_thin(pool);
+ while (tc) {
+ dm_internal_suspend_noflush(tc->thin_md);
+ tc = get_next_thin(pool, tc);
+ }
+}
+
+static void pool_resume_active_thins(struct pool *pool)
+{
+ struct thin_c *tc;
+
+ /* Resume all active thin devices */
+ tc = get_first_thin(pool);
+ while (tc) {
+ dm_internal_resume(tc->thin_md);
+ tc = get_next_thin(pool, tc);
+ }
+}
+
static void pool_resume(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
unsigned long flags;
+ /*
+ * Must requeue active_thins' bios and then resume
+ * active_thins _before_ clearing 'suspend' flag.
+ */
+ requeue_bios(pool);
+ pool_resume_active_thins(pool);
+
spin_lock_irqsave(&pool->lock, flags);
pool->low_water_triggered = false;
+ pool->suspended = false;
spin_unlock_irqrestore(&pool->lock, flags);
- requeue_bios(pool);
do_waker(&pool->waker.work);
}
+static void pool_presuspend(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->suspended = true;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ pool_suspend_active_thins(pool);
+}
+
+static void pool_presuspend_undo(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ pool_resume_active_thins(pool);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->suspended = false;
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
static void pool_postsuspend(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
@@ -2941,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct
* create_thin <dev_id>
* create_snap <dev_id> <origin_id>
* delete <dev_id>
- * trim <dev_id> <new_size_in_sectors>
* set_transaction_id <current_trans_id> <new_trans_id>
* reserve_metadata_snap
* release_metadata_snap
@@ -3169,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+ sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If max_sectors is smaller than pool->sectors_per_block adjust it
+ * to the highest possible power-of-2 factor of pool->sectors_per_block.
+ * This is especially beneficial when the pool's data device is a RAID
+ * device that has a full stripe width that matches pool->sectors_per_block
+ * -- because even though partial RAID stripe-sized IOs will be issued to a
+ * single RAID stripe; when aggregated they will end on a full RAID stripe
+ * boundary.. which avoids additional partial RAID stripe writes cascading
+ */
+ if (limits->max_sectors < pool->sectors_per_block) {
+ while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
+ if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
+ limits->max_sectors--;
+ limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+ }
+ }
/*
* If the system-determined stacked limits are compatible with the
* pool's blocksize (io_opt is a factor) do not override them.
*/
if (io_opt_sectors < pool->sectors_per_block ||
- do_div(io_opt_sectors, pool->sectors_per_block)) {
- blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ !is_factor(io_opt_sectors, pool->sectors_per_block)) {
+ if (is_factor(pool->sectors_per_block, limits->max_sectors))
+ blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+ else
+ blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}
@@ -3206,11 +3641,13 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
.map = pool_map,
+ .presuspend = pool_presuspend,
+ .presuspend_undo = pool_presuspend_undo,
.postsuspend = pool_postsuspend,
.preresume = pool_preresume,
.resume = pool_resume,
@@ -3240,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti)
struct thin_c *tc = ti->private;
unsigned long flags;
- thin_put(tc);
- wait_for_completion(&tc->can_destroy);
-
spin_lock_irqsave(&tc->pool->lock, flags);
list_del_rcu(&tc->list);
spin_unlock_irqrestore(&tc->pool->lock, flags);
synchronize_rcu();
+ thin_put(tc);
+ wait_for_completion(&tc->can_destroy);
+
mutex_lock(&dm_thin_pool_table.mutex);
__pool_dec(tc->pool);
@@ -3294,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -ENOMEM;
goto out_unlock;
}
+ tc->thin_md = dm_table_get_md(ti->table);
spin_lock_init(&tc->lock);
+ INIT_LIST_HEAD(&tc->deferred_cells);
bio_list_init(&tc->deferred_bio_list);
bio_list_init(&tc->retry_on_resume_list);
tc->sort_bio_list = RB_ROOT;
@@ -3339,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(tc->pool) == PM_FAIL) {
ti->error = "Couldn't open thin device, Pool is in fail mode";
r = -EINVAL;
- goto bad_thin_open;
+ goto bad_pool;
}
r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
if (r) {
ti->error = "Couldn't open thin internal device";
- goto bad_thin_open;
+ goto bad_pool;
}
r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
if (r)
- goto bad_target_max_io_len;
+ goto bad;
ti->num_flush_bios = 1;
ti->flush_supported = true;
@@ -3365,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_discard_bios = true;
}
- dm_put(pool_md);
-
mutex_unlock(&dm_thin_pool_table.mutex);
- atomic_set(&tc->refcount, 1);
- init_completion(&tc->can_destroy);
-
spin_lock_irqsave(&tc->pool->lock, flags);
+ if (tc->pool->suspended) {
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+ mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
+ ti->error = "Unable to activate thin device while pool is suspended";
+ r = -EINVAL;
+ goto bad;
+ }
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irqrestore(&tc->pool->lock, flags);
/*
@@ -3383,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
*/
synchronize_rcu();
+ dm_put(pool_md);
+
+ atomic_set(&tc->refcount, 1);
+ init_completion(&tc->can_destroy);
+
return 0;
-bad_target_max_io_len:
+bad:
dm_pool_close_thin_device(tc->td);
-bad_thin_open:
+bad_pool:
__pool_dec(tc->pool);
bad_pool_lookup:
dm_put(pool_md);
@@ -3533,6 +3979,21 @@ err:
DMEMIT("Error");
}
+static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct thin_c *tc = ti->private;
+ struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = tc->pool_dev->bdev;
+ bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static int thin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -3557,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -3567,6 +4028,7 @@ static struct target_type thin_target = {
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
+ .merge = thin_merge,
.iterate_devices = thin_iterate_devices,
};
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..4c06585bf165 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
+#include <linux/wait.h>
#include <trace/events/block.h>
@@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
/*
* A dummy definition to make RCU happy.
@@ -140,7 +142,7 @@ struct mapped_device {
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
- struct dm_table *map;
+ struct dm_table __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
@@ -525,14 +527,15 @@ retry:
goto out;
tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->ioctl)
+ goto out;
if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
- if (tgt->type->ioctl)
- r = tgt->type->ioctl(tgt, cmd, arg);
+ r = tgt->type->ioctl(tgt, cmd, arg);
out:
dm_put_live_table(md, srcu_idx);
@@ -602,13 +605,10 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
- int pending, cpu;
+ int pending;
int rw = bio_data_dir(bio);
- cpu = part_stat_lock();
- part_round_stats(cpu, &dm_disk(md)->part0);
- part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
- part_stat_unlock();
+ generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
@@ -1607,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q,
* Find maximum amount of I/O that won't need splitting
*/
max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) BIO_MAX_SECTORS);
+ (sector_t) queue_max_sectors(q));
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (max_size < 0)
+ if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
max_size = 0;
/*
@@ -1621,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q,
max_size = ti->type->merge(ti, bvm, biovec, max_size);
/*
* If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking at
- * queue_max_hw_sectors), then we can't allow bios with multiple vector
- * entries. So always set max_size to 0, and the code below allows
- * just one page.
+ * provided their merge_bvec method (we know this by looking for the
+ * max_hw_sectors that dm_set_device_limits may set), then we can't
+ * allow bios with multiple vector entries. So always set max_size
+ * to 0, and the code below allows just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
@@ -1648,16 +1648,12 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
- int cpu;
int srcu_idx;
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
- part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
- part_stat_unlock();
+ generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
@@ -2332,7 +2328,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
merge_is_optional = dm_table_merge_is_optional(t);
- old_map = md->map;
+ old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
@@ -2341,7 +2337,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
else
clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
- dm_sync_table(md);
+ if (old_map)
+ dm_sync_table(md);
return old_map;
}
@@ -2351,7 +2348,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
*/
static struct dm_table *__unbind(struct mapped_device *md)
{
- struct dm_table *map = md->map;
+ struct dm_table *map = rcu_dereference_protected(md->map, 1);
if (!map)
return NULL;
@@ -2716,36 +2713,18 @@ static void unlock_fs(struct mapped_device *md)
}
/*
- * We need to be able to change a mapping table under a mounted
- * filesystem. For example we might want to move some data in
- * the background. Before the table can be swapped with
- * dm_bind_table, dm_suspend must be called to flush any in
- * flight bios and ensure that any further io gets deferred.
- */
-/*
- * Suspend mechanism in request-based dm.
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
*
- * 1. Flush all I/Os by lock_fs() if needed.
- * 2. Stop dispatching any I/O by stopping the request_queue.
- * 3. Wait for all in-flight I/Os to be completed or requeued.
- *
- * To abort suspend, start the request_queue.
+ * Caller must hold md->suspend_lock
*/
-int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+ unsigned suspend_flags, int interruptible)
{
- struct dm_table *map = NULL;
- int r = 0;
- int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
- int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
-
- mutex_lock(&md->suspend_lock);
-
- if (dm_suspended_md(md)) {
- r = -EINVAL;
- goto out_unlock;
- }
-
- map = md->map;
+ bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+ bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+ int r;
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2754,7 +2733,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- /* This does not get reverted if there's an error later. */
+ /*
+ * This gets reverted if there's an error later and the targets
+ * provide the .presuspend_undo hook.
+ */
dm_table_presuspend_targets(map);
/*
@@ -2765,8 +2747,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
*/
if (!noflush && do_lockfs) {
r = lock_fs(md);
- if (r)
- goto out_unlock;
+ if (r) {
+ dm_table_presuspend_undo_targets(map);
+ return r;
+ }
}
/*
@@ -2782,7 +2766,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
* flush_workqueue(md->wq).
*/
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/*
* Stop md->queue before flushing md->wq in case request-based
@@ -2798,11 +2783,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
- r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
+ r = dm_wait_for_completion(md, interruptible);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
@@ -2812,14 +2798,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
start_queue(md->queue);
unlock_fs(md);
- goto out_unlock; /* pushback list is already flushed, so skip flush */
+ dm_table_presuspend_undo_targets(map);
+ /* pushback list is already flushed, so skip flush */
}
- /*
- * If dm_wait_for_completion returned 0, the device is completely
- * quiescent now. There is no request-processing activity. All new
- * requests are being added to md->deferred list.
- */
+ return r;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem. For example we might want to move some data in
+ * the background. Before the table can be swapped with
+ * dm_bind_table, dm_suspend must be called to flush any in
+ * flight bios and ensure that any further io gets deferred.
+ */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
+ *
+ * To abort suspend, start the request_queue.
+ */
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+ struct dm_table *map = NULL;
+ int r = 0;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (dm_suspended_md(md)) {
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+ if (r)
+ goto out_unlock;
set_bit(DMF_SUSPENDED, &md->flags);
@@ -2830,22 +2858,13 @@ out_unlock:
return r;
}
-int dm_resume(struct mapped_device *md)
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
- int r = -EINVAL;
- struct dm_table *map = NULL;
-
- mutex_lock(&md->suspend_lock);
- if (!dm_suspended_md(md))
- goto out;
-
- map = md->map;
- if (!map || !dm_table_get_size(map))
- goto out;
-
- r = dm_table_resume_targets(map);
- if (r)
- goto out;
+ if (map) {
+ int r = dm_table_resume_targets(map);
+ if (r)
+ return r;
+ }
dm_queue_flush(md);
@@ -2859,6 +2878,37 @@ int dm_resume(struct mapped_device *md)
unlock_fs(md);
+ return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+ int r = -EINVAL;
+ struct dm_table *map = NULL;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (!dm_suspended_md(md))
+ goto out;
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ r = __dm_resume(md, map);
+ if (r)
+ goto out;
+
clear_bit(DMF_SUSPENDED, &md->flags);
r = 0;
@@ -2872,15 +2922,80 @@ out:
* Internal suspend/resume works like userspace-driven suspend. It waits
* until all bios finish and prevents issuing new bios to the target drivers.
* It may be used only from the kernel.
- *
- * Internal suspend holds md->suspend_lock, which prevents interaction with
- * userspace-driven suspend.
*/
-void dm_internal_suspend(struct mapped_device *md)
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
- mutex_lock(&md->suspend_lock);
+ struct dm_table *map = NULL;
+
+ if (dm_suspended_internally_md(md))
+ return; /* nested internal suspend */
+
+ if (dm_suspended_md(md)) {
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ return; /* nest suspend */
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ /*
+ * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+ * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
+ * would require changing .presuspend to return an error -- avoid this
+ * until there is a need for more elaborate variants of internal suspend.
+ */
+ (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+ dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+ if (!dm_suspended_internally_md(md))
+ return; /* resume from nested internal suspend */
+
if (dm_suspended_md(md))
+ goto done; /* resume from nested suspend */
+
+ /*
+ * NOTE: existing callers don't need to call dm_table_resume_targets
+ * (which may fail -- so best to avoid it for now by passing NULL map)
+ */
+ (void) __dm_resume(md, NULL);
+
+done:
+ clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_resume(md);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
return;
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2889,9 +3004,9 @@ void dm_internal_suspend(struct mapped_device *md)
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
-void dm_internal_resume(struct mapped_device *md)
+void dm_internal_resume_fast(struct mapped_device *md)
{
- if (dm_suspended_md(md))
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
goto done;
dm_queue_flush(md);
@@ -2977,6 +3092,11 @@ int dm_suspended_md(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+}
+
int dm_test_deferred_remove_flag(struct mapped_device *md)
{
return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 988c7fb7b145..84b0f9e4ba6c 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
+void dm_table_presuspend_undo_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
@@ -129,6 +130,15 @@ int dm_deleting_md(struct mapped_device *md);
int dm_suspended_md(struct mapped_device *md);
/*
+ * Internal suspend and resume methods.
+ */
+int dm_suspended_internally_md(struct mapped_device *md);
+void dm_internal_suspend_fast(struct mapped_device *md);
+void dm_internal_resume_fast(struct mapped_device *md);
+void dm_internal_suspend_noflush(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
+
+/*
* Test if the device is scheduled for deferred remove.
*/
int dm_test_deferred_remove_flag(struct mapped_device *md);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4dfa15da9cb8..709755fb6d7b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -247,7 +247,6 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
struct mddev *mddev = q->queuedata;
- int cpu;
unsigned int sectors;
if (mddev == NULL || mddev->pers == NULL
@@ -284,10 +283,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
sectors = bio_sectors(bio);
mddev->pers->make_request(mddev, bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
- part_stat_unlock();
+ generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
wake_up(&mddev->sb_wait);
@@ -2695,7 +2691,8 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
if (kstrtoull(buf, 10, &new_offset) < 0)
return -EINVAL;
- if (mddev->sync_thread)
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
return -EBUSY;
if (new_offset == rdev->data_offset)
/* reset is always permitted */
@@ -3272,6 +3269,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
*/
if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
return -EBUSY;
@@ -4026,6 +4024,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+ flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
@@ -5044,6 +5043,7 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
@@ -5104,23 +5104,27 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
- if (mddev->sync_thread) {
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ if (mddev->sync_thread)
/* Thread might be blocked waiting for metadata update
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
- }
+
mddev_unlock(mddev);
- wait_event(resync_wait, mddev->sync_thread == NULL);
+ wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
+ &mddev->recovery));
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev));
if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
err = -EBUSY;
@@ -5135,6 +5139,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_state);
err = 0;
}
@@ -5159,25 +5165,30 @@ static int do_md_stop(struct mddev *mddev, int mode,
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
- if (mddev->sync_thread) {
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ if (mddev->sync_thread)
/* Thread might be blocked waiting for metadata update
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
- }
+
mddev_unlock(mddev);
- wait_event(resync_wait, mddev->sync_thread == NULL);
+ wait_event(resync_wait, (mddev->sync_thread == NULL &&
+ !test_bit(MD_RECOVERY_RUNNING,
+ &mddev->recovery)));
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sysfs_active ||
mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex);
if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
return -EBUSY;
@@ -5946,7 +5957,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* of each device. If num_sectors is zero, we find the largest size
* that fits.
*/
- if (mddev->sync_thread)
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->sync_thread)
return -EBUSY;
if (mddev->ro)
return -EROFS;
@@ -5977,7 +5989,9 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL;
- if (mddev->sync_thread || mddev->reshape_position != MaxSector)
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->reshape_position != MaxSector)
return -EBUSY;
rdev_for_each(rdev, mddev) {
@@ -6965,7 +6979,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
int mask;
if (md_unloading)
- return POLLIN|POLLRDNORM|POLLERR|POLLPRI;;
+ return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
poll_wait(filp, &md_event_waiters, wait);
/* always allow read */
@@ -7589,6 +7603,7 @@ static void md_start_sync(struct work_struct *ws)
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
if (mddev->sysfs_action)
@@ -7757,6 +7772,7 @@ void md_check_recovery(struct mddev *mddev)
not_running:
if (!mddev->sync_thread) {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
if (mddev->sysfs_action)
@@ -7775,7 +7791,6 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
- wake_up(&resync_wait);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* success...*/
@@ -7803,6 +7818,7 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_action);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 1d75b1dc1e2e..e64b61ad0ef3 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -645,8 +645,10 @@ static int array_resize(struct dm_array_info *info, dm_block_t root,
int r;
struct resize resize;
- if (old_size == new_size)
+ if (old_size == new_size) {
+ *new_root = root;
return 0;
+ }
resize.info = info;
resize.root = root;
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 37d367bb9aa8..bf2b80d5c470 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -42,6 +42,12 @@ struct btree_node {
} __packed;
+/*
+ * Locks a block using the btree node validator.
+ */
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+ struct dm_block **result);
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index cf9fd676ae44..1b5e13ec7f96 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = {
/*----------------------------------------------------------------*/
-static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
struct dm_block **result)
{
return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 416060c25709..200ac12a1d40 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -847,22 +847,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
* FIXME: We shouldn't use a recursive algorithm when we have limited stack
* space. Also this only works for single level trees.
*/
-static int walk_node(struct ro_spine *s, dm_block_t block,
+static int walk_node(struct dm_btree_info *info, dm_block_t block,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
int r;
unsigned i, nr;
+ struct dm_block *node;
struct btree_node *n;
uint64_t keys;
- r = ro_step(s, block);
- n = ro_node(s);
+ r = bn_read_lock(info, block, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
nr = le32_to_cpu(n->header.nr_entries);
for (i = 0; i < nr; i++) {
if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
- r = walk_node(s, value64(n, i), fn, context);
+ r = walk_node(info, value64(n, i), fn, context);
if (r)
goto out;
} else {
@@ -874,7 +878,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block,
}
out:
- ro_pop(s);
+ dm_tm_unlock(info->tm, node);
return r;
}
@@ -882,15 +886,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
- int r;
- struct ro_spine spine;
-
BUG_ON(info->levels > 1);
-
- init_ro_spine(&spine, info);
- r = walk_node(&spine, root, fn, context);
- exit_ro_spine(&spine);
-
- return r;
+ return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 786b689bdfc7..e8a904298887 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count
{
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- return smm->ll.nr_blocks;
+ *count = smm->ll.nr_blocks;
+
+ return 0;
}
static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
@@ -581,7 +583,9 @@ static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
{
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- return b < smm->begin ? 1 : 0;
+ *result = (b < smm->begin) ? 1 : 0;
+
+ return 0;
}
static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 3bc30a0ae3d6..9cb797d800cf 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,8 @@
#include "dm-persistent-data-internal.h"
#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
@@ -17,6 +19,61 @@
/*----------------------------------------------------------------*/
+#define PREFETCH_SIZE 128
+#define PREFETCH_BITS 7
+#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
+
+struct prefetch_set {
+ struct mutex lock;
+ dm_block_t blocks[PREFETCH_SIZE];
+};
+
+static unsigned prefetch_hash(dm_block_t b)
+{
+ return hash_64(b, PREFETCH_BITS);
+}
+
+static void prefetch_wipe(struct prefetch_set *p)
+{
+ unsigned i;
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ p->blocks[i] = PREFETCH_SENTINEL;
+}
+
+static void prefetch_init(struct prefetch_set *p)
+{
+ mutex_init(&p->lock);
+ prefetch_wipe(p);
+}
+
+static void prefetch_add(struct prefetch_set *p, dm_block_t b)
+{
+ unsigned h = prefetch_hash(b);
+
+ mutex_lock(&p->lock);
+ if (p->blocks[h] == PREFETCH_SENTINEL)
+ p->blocks[h] = b;
+
+ mutex_unlock(&p->lock);
+}
+
+static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
+{
+ unsigned i;
+
+ mutex_lock(&p->lock);
+
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ if (p->blocks[i] != PREFETCH_SENTINEL) {
+ dm_bm_prefetch(bm, p->blocks[i]);
+ p->blocks[i] = PREFETCH_SENTINEL;
+ }
+
+ mutex_unlock(&p->lock);
+}
+
+/*----------------------------------------------------------------*/
+
struct shadow_info {
struct hlist_node hlist;
dm_block_t where;
@@ -37,6 +94,8 @@ struct dm_transaction_manager {
spinlock_t lock;
struct hlist_head buckets[DM_HASH_SIZE];
+
+ struct prefetch_set prefetches;
};
/*----------------------------------------------------------------*/
@@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
for (i = 0; i < DM_HASH_SIZE; i++)
INIT_HLIST_HEAD(tm->buckets + i);
+ prefetch_init(&tm->prefetches);
+
return tm;
}
@@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **blk)
{
- if (tm->is_clone)
- return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+ if (tm->is_clone) {
+ int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+
+ if (r == -EWOULDBLOCK)
+ prefetch_add(&tm->real->prefetches, b);
+
+ return r;
+ }
return dm_bm_read_lock(tm->bm, b, v, blk);
}
@@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
return tm->bm;
}
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
+{
+ prefetch_issue(&tm->prefetches, tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
+
/*----------------------------------------------------------------*/
static int dm_tm_create_internal(struct dm_block_manager *bm,
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2772ed2a781a..2e0d4d66fb1b 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
/*
+ * If you're using a non-blocking clone the tm will build up a list of
+ * requested blocks that weren't in core. This call will request those
+ * blocks to be prefetched.
+ */
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
+
+/*
* A little utility that ties the knot by producing a transaction manager
* that has a space map managed by the transaction manager...
*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9c66e5997fc8..c1b0d52bfcb0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2917,8 +2917,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
(!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- (sh->raid_conf->level == 6 && s->failed && s->to_write &&
- s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 &&
+ ((sh->raid_conf->level == 6 ||
+ sh->sector >= sh->raid_conf->mddev->recovery_cp)
+ && s->failed && s->to_write &&
+ (s->to_write - s->non_overwrite <
+ sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
(!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync