diff options
Diffstat (limited to 'drivers/md/bcache')
29 files changed, 2437 insertions, 1156 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 6dfa653d30db..529c9d04e9a4 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 help Allows a block device to be used as cache for other devices; uses @@ -26,3 +27,12 @@ config BCACHE_CLOSURES_DEBUG Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. + +config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration" + depends on BCACHE + help + Add a sysfs file /sys/fs/bcache/register_async. Writing registering + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index fd714628da6a..5b87e59676b8 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o + util.o writeback.o features.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..ce13c272c387 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -49,7 +49,7 @@ * * bch_bucket_alloc() allocates a single bucket from a specific cache. * - * bch_bucket_alloc_set() allocates one or more buckets from different caches + * bch_bucket_alloc_set() allocates one bucket from different caches * out of a cache set. * * free_some_buckets() drives all the processes described above. It's called @@ -87,8 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; - unsigned int i; + unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024; int r; atomic_sub(sectors, &c->rescale); @@ -104,14 +103,14 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) c->min_prio = USHRT_MAX; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) - if (b->prio && - b->prio != BTREE_PRIO && - !atomic_read(&b->pin)) { - b->prio--; - c->min_prio = min(c->min_prio, b->prio); - } + ca = c->cache; + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } mutex_unlock(&c->bucket_lock); } @@ -337,7 +336,7 @@ static int bch_allocator_thread(void *arg) mutex_unlock(&ca->set->bucket_lock); blkdev_issue_discard(ca->bdev, bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL, 0); + ca->sb.bucket_size, GFP_KERNEL); mutex_lock(&ca->set->bucket_lock); } @@ -362,7 +361,7 @@ retry_invalidate: * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb)) { + if (CACHE_SYNC(&ca->sb)) { /* * This could deadlock if an allocation with a btree * node locked ever blocked - having the btree node @@ -483,39 +482,33 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) - __bch_bucket_free(PTR_CACHE(c, k, i), - PTR_BUCKET(c, k, i)); + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { - int i; + struct cache *ca; + long b; /* No allocation if CACHE_SET_IO_DISABLE bit is set */ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return -1; lockdep_assert_held(&c->bucket_lock); - BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); bkey_init(k); - /* sort by free space/prio of oldest data in caches */ - - for (i = 0; i < n; i++) { - struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, reserve, wait); + ca = c->cache; + b = bch_bucket_alloc(ca, reserve, wait); + if (b == -1) + goto err; - if (b == -1) - goto err; + k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); - k->ptr[i] = MAKE_PTR(ca->buckets[b].gen, - bucket_to_sector(c, b), - ca->sb.nr_this_dev); - - SET_KEY_PTRS(k, i + 1); - } + SET_KEY_PTRS(k, 1); return 0; err: @@ -525,12 +518,12 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -589,7 +582,7 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, struct open_bucket, list); found: if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; + ret->sectors_free = c->cache->sb.bucket_size; bkey_copy(&ret->key, alloc); bkey_init(alloc); } @@ -638,7 +631,7 @@ bool bch_alloc_sectors(struct cache_set *c, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) return false; spin_lock(&c->data_bucket_lock); @@ -680,10 +673,10 @@ bool bch_alloc_sectors(struct cache_set *c, SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + &c->cache->sectors_written); } - if (b->sectors_free < c->sb.block_size) + if (b->sectors_free < c->cache->sb.block_size) b->sectors_free = 0; /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 74a9849ea164..aebb7ef10e63 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -107,7 +107,7 @@ * * BTREE NODES: * - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node @@ -176,9 +176,8 @@ * - updates to non leaf nodes just happen synchronously (see btree_split()). */ -#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#define pr_fmt(fmt) "bcache: %s() " fmt, __func__ -#include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> #include <linux/list.h> @@ -190,6 +189,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include "bcache_ondisk.h" #include "bset.h" #include "util.h" #include "closure.h" @@ -264,7 +264,7 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned int nr_stripes; + int nr_stripes; unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -364,7 +364,6 @@ struct cached_dev { /* The rest of this all shows up in sysfs */ unsigned int sequential_cutoff; - unsigned int readahead; unsigned int io_disable:1; unsigned int verify:1; @@ -373,6 +372,7 @@ struct cached_dev { unsigned int partial_stripes_expensive:1; unsigned int writeback_metadata:1; unsigned int writeback_running:1; + unsigned int writeback_consider_fragment:1; unsigned char writeback_percent; unsigned int writeback_delay; @@ -385,6 +385,9 @@ struct cached_dev { unsigned int writeback_rate_update_seconds; unsigned int writeback_rate_i_term_inverse; unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_fp_term_low; + unsigned int writeback_rate_fp_term_mid; + unsigned int writeback_rate_fp_term_high; unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; @@ -393,7 +396,12 @@ struct cached_dev { unsigned int error_limit; unsigned int offline_seconds; - char backing_dev_name[BDEVNAME_SIZE]; + /* + * Retry to update writeback_rate if contention happens for + * down_read(dc->writeback_lock) in update_writeback_rate() + */ +#define BCH_WBRATE_UPDATE_MAX_SKIPS 15 + unsigned int rate_update_retry; }; enum alloc_reserve { @@ -467,8 +475,6 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; - - char cache_dev_name[BDEVNAME_SIZE]; }; struct gc_stat { @@ -517,11 +523,7 @@ struct cache_set { atomic_t idle_counter; atomic_t at_max_writeback_rate; - struct cache_sb sb; - - struct cache *cache[MAX_CACHES_PER_SET]; - struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; - int caches_loaded; + struct cache *cache; struct bcache_device **devices; unsigned int devices_max_used; @@ -670,6 +672,7 @@ struct cache_set { struct mutex verify_lock; #endif + uint8_t set_uuid[16]; unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); @@ -758,15 +761,35 @@ struct bbio { #define btree_default_blocks(c) \ ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) -#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) -#define bucket_bytes(c) ((c)->sb.bucket_size << 9) -#define block_bytes(c) ((c)->sb.block_size << 9) +#define bucket_bytes(ca) ((ca)->sb.bucket_size << 9) +#define block_bytes(ca) ((ca)->sb.block_size << 9) -#define prios_per_bucket(c) \ - ((bucket_bytes(c) - sizeof(struct prio_set)) / \ +static inline unsigned int meta_bucket_pages(struct cache_sb *sb) +{ + unsigned int n, max_pages; + + max_pages = min_t(unsigned int, + __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS, + MAX_ORDER_NR_PAGES); + + n = sb->bucket_size / PAGE_SECTORS; + if (n > max_pages) + n = max_pages; + + return n; +} + +static inline unsigned int meta_bucket_bytes(struct cache_sb *sb) +{ + return meta_bucket_pages(sb) << PAGE_SHIFT; +} + +#define prios_per_bucket(ca) \ + ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) -#define prio_buckets(c) \ - DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +#define prio_buckets(ca) \ + DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca)) static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { @@ -780,14 +803,7 @@ static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) { - return s & (c->sb.bucket_size - 1); -} - -static inline struct cache *PTR_CACHE(struct cache_set *c, - const struct bkey *k, - unsigned int ptr) -{ - return c->cache[PTR_DEV(k, ptr)]; + return s & (c->cache->sb.bucket_size - 1); } static inline size_t PTR_BUCKET_NR(struct cache_set *c, @@ -801,7 +817,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); + return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); } static inline uint8_t gen_after(uint8_t a, uint8_t b) @@ -820,7 +836,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, static inline bool ptr_available(struct cache_set *c, const struct bkey *k, unsigned int i) { - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; } /* Btree key macros */ @@ -868,9 +884,6 @@ do { \ /* Looping macros */ -#define for_each_cache(ca, cs, iter) \ - for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) - #define for_each_bucket(b, ca) \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) @@ -912,11 +925,9 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) static inline void wake_up_allocators(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - wake_up_process(ca->alloc_thread); + wake_up_process(ca->alloc_thread); } static inline void closure_bio_submit(struct cache_set *c, @@ -929,7 +940,7 @@ static inline void closure_bio_submit(struct cache_set *c, bio_endio(bio); return; } - generic_make_request(bio); + submit_bio_noacct(bio); } /* @@ -973,9 +984,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k); long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); @@ -989,6 +1000,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; extern struct workqueue_struct *bch_journal_wq; +extern struct workqueue_struct *bch_flush_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; @@ -1030,5 +1042,7 @@ void bch_debug_exit(void); void bch_debug_init(void); void bch_request_exit(void); int bch_request_init(void); +void bch_btree_exit(void); +int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h new file mode 100644 index 000000000000..97413586195b --- /dev/null +++ b/drivers/md/bcache/bcache_ondisk.h @@ -0,0 +1,445 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include <linux/types.h> + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned int i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 +#define KEY_MAX_U64S 8 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(__PAD0, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(__PAD1, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define MAKE_PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) +{ + __u64 *d = (void *) k; + + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 + +#define SB_SECTOR 8 +#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb_disk { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __le64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __le64 flags; + __le64 seq; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; + + union { + struct { + /* Cache devices */ + __le64 nbuckets; /* device size */ + + __le16 block_size; /* sectors */ + __le16 bucket_size; /* sectors */ + + __le16 nr_in_set; + __le16 nr_this_dev; + }; + struct { + /* Backing devices */ + __le64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __le32 last_mount; /* time overflow in y2106 */ + + __le16 first_bucket; + union { + __le16 njournal_buckets; + __le16 keys; + }; + __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 obso_bucket_size_hi; /* obsoleted */ +}; + +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ +struct cache_sb { + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 nr_in_set; + __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time overflow in y2106 */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; /* time overflow in y2106 */ + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 4385303836d8..2bba4d6aaaa2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -6,7 +6,7 @@ * Copyright 2012 Google, Inc. */ -#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#define pr_fmt(fmt) "bcache: %s() " fmt, __func__ #include "util.h" #include "bset.h" @@ -31,7 +31,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set) if (b->ops->key_dump) b->ops->key_dump(b, k); else - pr_err("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + pr_cont("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); if (next < bset_bkey_last(i) && bkey_cmp(k, b->ops->is_extents ? @@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b, b->page_order = page_order; - t->data = (void *) __get_free_pages(gfp, b->page_order); + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); if (!t->data) goto err; @@ -712,8 +712,10 @@ void bch_bset_build_written_tree(struct btree_keys *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) < cacheline) - prev = k, k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) { + prev = k; + k = bkey_next(k); + } t->prev[j] = bkey_u64s(prev); t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); @@ -901,8 +903,10 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, status = BTREE_INSERT_STATUS_INSERT; while (m != bset_bkey_last(i) && - bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) - prev = m, m = bkey_next(m); + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) { + prev = m; + m = bkey_next(m); + } /* prev is in the tree, if we merge we're done */ status = BTREE_INSERT_STATUS_BACK_MERGE; @@ -1225,7 +1229,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; - pr_debug("sorted %i keys", out->keys); + pr_debug("sorted %i keys\n", out->keys); } static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, @@ -1260,7 +1264,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * * Don't worry event 'out' is allocated from mempool, it can * still be swapped here. Because state->pool is a page mempool - * creaated by by mempool_init_page_pool(), which allocates + * created by mempool_init_page_pool(), which allocates * pages by alloc_pages() indeed. */ diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index a50dcfda656f..d795c84246b0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -2,10 +2,10 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include <linux/bcache.h> #include <linux/kernel.h> #include <linux/types.h> +#include "bcache_ondisk.h" #include "util.h" /* for time_stats */ /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa872df4e770..147c493a989a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -99,70 +99,14 @@ #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -#define insert_lock(s, b) ((b)->level <= (s)->lock) +static struct workqueue_struct *btree_io_wq; -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ +#define insert_lock(s, b) ((b)->level <= (s)->lock) -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ - _w, b); \ - if (!IS_ERR(_child)) { \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -EINTR) \ - schedule(); \ - } while (_r == -EINTR); \ - \ - finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ - _r; \ -}) static inline struct bset *write_block(struct btree *b) { - return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); } static void bch_btree_init_next(struct btree *b) @@ -175,7 +119,7 @@ static void bch_btree_init_next(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); } @@ -197,7 +141,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) uint64_t crc = b->key.ptr[0]; void *data = (void *) i + 8, *end = bset_bkey_last(i); - crc = bch_crc64_update(crc, data, end - data); + crc = crc64_be(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } @@ -213,7 +157,7 @@ void bch_btree_node_read_done(struct btree *b) * See the comment arount cache_set->fill_iter. */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter->size = b->c->sb.bucket_size / b->c->sb.block_size; + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG @@ -231,12 +175,12 @@ void bch_btree_node_read_done(struct btree *b) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, block_bytes(b->c)) > + if (b->written + set_blocks(i, block_bytes(b->c->cache)) > btree_blocks(b)) goto err; err = "bad magic"; - if (i->magic != bset_magic(&b->c->sb)) + if (i->magic != bset_magic(&b->c->cache->sb)) goto err; err = "bad checksum"; @@ -257,13 +201,13 @@ void bch_btree_node_read_done(struct btree *b) bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } err = "corrupted btree"; for (i = write_block(b); bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); - i = ((void *) i) + block_bytes(b->c)) + i = ((void *) i) + block_bytes(b->c->cache)) if (i->seq == b->keys.set[0].data->seq) goto err; @@ -277,7 +221,7 @@ void bch_btree_node_read_done(struct btree *b) if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bset_magic(&b->c->cache->sb)); out: mempool_free(iter, &b->c->fill_iter); return; @@ -366,7 +310,7 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - schedule_delayed_work(&b->work, 30 * HZ); + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } @@ -405,7 +349,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; - b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); @@ -481,10 +425,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, - &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, + &b->c->cache->btree_sectors_written); - b->written += set_blocks(i, block_bytes(b->c)); + b->written += set_blocks(i, block_bytes(b->c->cache)); } void bch_btree_node_write(struct btree *b, struct closure *parent) @@ -539,7 +483,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) BUG_ON(!i->keys); if (!btree_node_dirty(b)) - schedule_delayed_work(&b->work, 30 * HZ); + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); set_btree_node_dirty(b); @@ -572,7 +516,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -#define mca_reserve(c) (((c->root && c->root->level) \ +#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ max_t(int, 0, c->btree_cache_used - mca_reserve(c)) @@ -677,7 +621,7 @@ retry: * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). */ if (btree_node_journal_flush(b)) { - pr_debug("bnode %p is flushing by journal, retry", b); + pr_debug("bnode %p is flushing by journal, retry\n", b); mutex_unlock(&b->write_lock); udelay(1); goto retry; @@ -796,7 +740,7 @@ void bch_btree_cache_free(struct cache_set *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); #endif list_splice(&c->btree_cache_freeable, @@ -843,7 +787,16 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, + ilog2(meta_bucket_pages(&c->cache->sb))); + if (!c->verify_ondisk) { + /* + * Don't worry about the mca_rereserve buckets + * allocated in previous for-loop, they will be + * handled properly in bch_cache_set_unregister(). + */ + return -ENOMEM; + } c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); @@ -859,8 +812,8 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - if (register_shrinker(&c->shrink)) - pr_warn("bcache: %s: could not register shrinker", + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) + pr_warn("bcache: %s: could not register shrinker\n", __func__); return 0; @@ -1017,7 +970,7 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. @@ -1112,7 +1065,7 @@ retry: */ if (btree_node_journal_flush(b)) { mutex_unlock(&b->write_lock); - pr_debug("bnode %p journal_flush set, retry", b); + pr_debug("bnode %p journal_flush set, retry\n", b); udelay(1); goto retry; } @@ -1141,7 +1094,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; bkey_put(c, &k.key); @@ -1158,7 +1111,7 @@ retry: } b->parent = parent; - bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); mutex_unlock(&c->bucket_lock); @@ -1208,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) SET_PTR_GEN(k, i, - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + bch_inc_gen(b->c->cache, PTR_BUCKET(b->c, &b->key, i))); mutex_unlock(&b->c->bucket_lock); @@ -1217,19 +1170,18 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; - struct cache *ca; - unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; + struct cache *ca = c->cache; + unsigned int reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { - if (op) - prepare_to_wait(&c->btree_cache_wait, &op->wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->bucket_lock); - return -EINTR; - } + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } mutex_unlock(&c->bucket_lock); @@ -1395,7 +1347,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (nodes < 2 || __set_blocks(b->keys.set[0].data, keys, - block_bytes(b->c)) > blocks * (nodes - 1)) + block_bytes(b->c->cache)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { @@ -1429,7 +1381,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k), - block_bytes(b->c)) > blocks) + block_bytes(b->c->cache)) > blocks) break; last = k; @@ -1445,16 +1397,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - block_bytes(b->c)) > + block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])) - goto out_nocoalesce; + goto out_unlock_nocoalesce; keys = n2->keys; /* Take the key of the node we're getting rid of */ last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])); if (last) @@ -1476,7 +1428,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (__bch_keylist_realloc(&keylist, bkey_u64s(&new_nodes[i]->key))) - goto out_nocoalesce; + goto out_unlock_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); bch_keylist_add(&keylist, &new_nodes[i]->key); @@ -1522,6 +1474,10 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, /* Invalidated our iterator */ return -EINTR; +out_unlock_nocoalesce: + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + out_nocoalesce: closure_sync(&cl); @@ -1741,7 +1697,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned int i; if (!c->gc_mark_valid) return; @@ -1751,14 +1706,14 @@ static void btree_gc_start(struct cache_set *c) c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; - for_each_cache(ca, c, i) - for_each_bucket(b, ca) { - b->last_gc = b->gen; - if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, 0); - SET_GC_SECTORS_USED(b, 0); - } + ca = c->cache; + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); } + } mutex_unlock(&c->bucket_lock); } @@ -1767,7 +1722,8 @@ static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; - unsigned int i; + unsigned int i, j; + uint64_t *k; mutex_lock(&c->bucket_lock); @@ -1785,7 +1741,6 @@ static void bch_btree_gc_finish(struct cache_set *c) struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; - unsigned int j; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; @@ -1802,29 +1757,27 @@ static void bch_btree_gc_finish(struct cache_set *c) rcu_read_unlock(); c->avail_nbuckets = 0; - for_each_cache(ca, c, i) { - uint64_t *i; - ca->invalidate_needs_gc = 0; + ca = c->cache; + ca->invalidate_needs_gc = 0; - for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for (i = ca->prio_buckets; - i < ca->prio_buckets + prio_buckets(ca) * 2; i++) - SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); - for_each_bucket(b, ca) { - c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if (atomic_read(&b->pin)) - continue; + if (atomic_read(&b->pin)) + continue; - BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); - if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - c->avail_nbuckets++; - } + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + c->avail_nbuckets++; } mutex_unlock(&c->bucket_lock); @@ -1848,7 +1801,7 @@ static void bch_btree_gc(struct cache_set *c) /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { - ret = btree_root(gc_root, c, &op, &writes, &stats); + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); cond_resched(); @@ -1856,7 +1809,7 @@ static void bch_btree_gc(struct cache_set *c) schedule_timeout_interruptible(msecs_to_jiffies (GC_SLEEP_MS)); else if (ret) - pr_warn("gc failed!"); + pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); bch_btree_gc_finish(c); @@ -1876,12 +1829,10 @@ static void bch_btree_gc(struct cache_set *c) static bool gc_should_run(struct cache_set *c) { - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) - return true; + if (ca->invalidate_needs_gc) + return true; if (atomic_read(&c->sectors_to_gc) < 0) return true; @@ -1946,7 +1897,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) } if (p) - ret = btree(check_recurse, p, b, op); + ret = bcache_btree(check_recurse, p, b, op); p = k; } while (p && !ret); @@ -1955,20 +1906,177 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) return ret; } + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + int bch_btree_check(struct cache_set *c) { - struct btree_op op; + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state check_state; - bch_btree_op_init(&op, SHRT_MAX); + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + memset(&check_state, 0, sizeof(struct btree_check_state)); + check_state.c = c; + check_state.total_threads = bch_btree_chkthread_nr(); + check_state.key_idx = 0; + spin_lock_init(&check_state.idx_lock); + atomic_set(&check_state.started, 0); + atomic_set(&check_state.enough, 0); + init_waitqueue_head(&check_state.wait); + + rw_lock(0, c->root, c->root->level); + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state.enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state.total_threads; i++) { + /* fetch latest check_state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&check_state.enough)) + break; - return btree_root(check_recurse, c, &op); + check_state.infos[i].result = 0; + check_state.infos[i].state = &check_state; + + check_state.infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state.infos[i], + "bch_btrchk[%d]", i); + if (IS_ERR(check_state.infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) + kthread_stop(check_state.infos[i].thread); + ret = -ENOMEM; + goto out; + } + atomic_inc(&check_state.started); + } + + /* + * Must wait for all threads to stop. + */ + wait_event(check_state.wait, atomic_read(&check_state.started) == 0); + + for (i = 0; i < check_state.total_threads; i++) { + if (check_state.infos[i].result) { + ret = check_state.infos[i].result; + goto out; + } + } + +out: + rw_unlock(0, c->root); + return ret; } void bch_initial_gc_finish(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; bch_btree_gc_finish(c); @@ -1983,20 +2091,18 @@ void bch_initial_gc_finish(struct cache_set *c) * This is only safe for buckets that have no live data in them, which * there should always be some of. */ - for_each_cache(ca, c, i) { - for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO]) && - fifo_full(&ca->free[RESERVE_BTREE])) - break; + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) + break; - if (bch_can_invalidate_bucket(ca, b) && - !GC_MARK(b)) { - __bch_invalidate_one_bucket(ca, b); - if (!fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets)) - fifo_push(&ca->free[RESERVE_BTREE], - b - ca->buckets); - } + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } @@ -2104,7 +2210,7 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err; split = set_blocks(btree_bset_first(n1), - block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; + block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned int keys = 0; @@ -2351,7 +2457,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys, if (ret) { struct bkey *k; - pr_err("error %i", ret); + pr_err("error %i\n", ret); while ((k = bch_keylist_pop(keys))) bkey_put(c, k); @@ -2401,7 +2507,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { - ret = btree(map_nodes_recurse, k, b, + ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); from = NULL; @@ -2419,10 +2525,10 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - return btree_root(map_nodes_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); } -static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { @@ -2435,7 +2541,8 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) - : btree(map_keys_recurse, k, b, op, from, fn, flags); + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) @@ -2452,7 +2559,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - return btree_root(map_keys_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ @@ -2638,7 +2745,7 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; if (bkey_cmp(&buf->last_scanned, end) >= 0) { - pr_debug("scan finished"); + pr_debug("scan finished\n"); break; } @@ -2656,3 +2763,18 @@ void bch_keybuf_init(struct keybuf *buf) spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); +} + +int __init bch_btree_init(void) +{ + btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + if (!btree_io_wq) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f4dcca449391..1b5fdbc0d83e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -145,6 +145,9 @@ struct btree { struct bio *bio; }; + + + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ @@ -191,7 +194,7 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); + atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16); } void bkey_put(struct cache_set *c, struct bkey *k); @@ -216,6 +219,25 @@ struct btree_op { unsigned int insert_collision:1; }; +struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 12 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); @@ -284,6 +306,65 @@ static inline void force_wake_up_gc(struct cache_set *c) wake_up_gc(c); } +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define bcache_btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define bcache_btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + #define MAP_DONE 0 #define MAP_CONTINUE 1 @@ -314,6 +395,9 @@ typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, struct bkey *k); int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0164a1fe94a9..d8d9394a6beb 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -159,7 +159,7 @@ void closure_debug_destroy(struct closure *cl) static struct dentry *closure_debug; -static int debug_seq_show(struct seq_file *f, void *data) +static int debug_show(struct seq_file *f, void *data) { struct closure *cl; @@ -188,17 +188,7 @@ static int debug_seq_show(struct seq_file *f, void *data) return 0; } -static int debug_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_seq_show, NULL); -} - -static const struct file_operations debug_ops = { - .owner = THIS_MODULE, - .open = debug_seq_open, - .read = seq_read, - .release = single_release -}; +DEFINE_SHOW_ATTRIBUTE(debug); void __init closure_debug_init(void) { @@ -209,7 +199,7 @@ void __init closure_debug_init(void) * about this. */ closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_ops); + "closures", 0400, bcache_debug, NULL, &debug_fops); } #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 336f43910383..7510d1c983a5 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -25,8 +25,8 @@ struct dentry *bcache_debug; for (i = (start); \ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ i->seq == (start)->seq; \ - i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ - block_bytes(b->c)) + i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) * \ + block_bytes(b->c->cache)) void bch_btree_verify(struct btree *b) { @@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); + bio_set_dev(bio, b->c->cache->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; @@ -82,14 +82,14 @@ void bch_btree_verify(struct btree *b) for_each_written_bset(b, ondisk, i) { unsigned int block = ((void *) i - (void *) ondisk) / - block_bytes(b->c); + block_bytes(b->c->cache); pr_err("*** on disk block %u:\n", block); bch_dump_bset(&b->keys, i, block); } pr_err("*** block %zu not written\n", - ((void *) i - (void *) ondisk) / block_bytes(b->c)); + ((void *) i - (void *) ondisk) / block_bytes(b->c->cache)); for (j = 0; j < inmemory->keys; j++) if (inmemory->d[j] != sorted->d[j]) @@ -107,15 +107,16 @@ void bch_btree_verify(struct btree *b) void bch_data_verify(struct cached_dev *dc, struct bio *bio) { + unsigned int nr_segs = bio_segments(bio); struct bio *check; struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); + check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - check->bi_disk = bio->bi_disk; - check->bi_opf = REQ_OP_READ; + bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, + REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; @@ -127,27 +128,27 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) citer.bi_size = UINT_MAX; bio_for_each_segment(bv, bio, iter) { - void *p1 = kmap_atomic(bv.bv_page); + void *p1 = bvec_kmap_local(&bv); void *p2; cbv = bio_iter_iovec(check, citer); - p2 = page_address(cbv.bv_page); + p2 = bvec_kmap_local(&cbv); - cache_set_err_on(memcmp(p1 + bv.bv_offset, - p2 + bv.bv_offset, - bv.bv_len), + cache_set_err_on(memcmp(p1, p2, bv.bv_len), dc->disk.c, - "verify failed at dev %s sector %llu", - dc->backing_dev_name, + "verify failed at dev %pg sector %llu", + dc->bdev, (uint64_t) bio->bi_iter.bi_sector); - kunmap_atomic(p1); + kunmap_local(p2); + kunmap_local(p1); bio_advance_iter(check, &citer, bv.bv_len); } bio_free_pages(check); out_put: - bio_put(check); + bio_uninit(check); + kfree(check); } #endif @@ -238,7 +239,7 @@ void bch_debug_init_cache_set(struct cache_set *c) if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; - snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + snprintf(name, 50, "bcache-%pU", c->set_uuid); c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 886710043025..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -50,11 +50,11 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size || + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) return true; @@ -71,11 +71,11 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - if (KEY_SIZE(k) + r > c->sb.bucket_size) + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) return "bad, length too big"; if (bucket < ca->sb.first_bucket) return "bad, short offset"; @@ -130,18 +130,18 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) char buf[80]; bch_extent_to_text(buf, sizeof(buf), k); - pr_err(" %s", buf); + pr_cont(" %s", buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); - pr_err(" bucket %zu", n); - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - pr_err(" prio %i", - PTR_BUCKET(b->c, k, j)->prio); + pr_cont(" bucket %zu", n); + if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets) + pr_cont(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); } - pr_err(" %s\n", bch_ptr_status(b->c, k)); + pr_cont(" %s\n", bch_ptr_status(b->c, k)); } /* Btree ptrs */ @@ -553,7 +553,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) if (stale && KEY_DIRTY(k)) { bch_extent_to_text(buf, sizeof(buf), k); - pr_info("stale dirty pointer, stale %u, key: %s", + pr_info("stale dirty pointer, stale %u, key: %s\n", stale, buf); } diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..634922c5601d --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li <colyli@suse.de> + * + */ +#include "bcache_ondisk.h" +#include "bcache.h" +#include "features.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, + "large_bucket"}, + {0, 0, NULL }, +}; + +#define compose_feature_string(type) \ +({ \ + struct feature *f; \ + bool first = true; \ + \ + for (f = &feature_list[0]; f->compat != 0; f++) { \ + if (f->compat != BCH_FEATURE_ ## type) \ + continue; \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) { \ + if (first) { \ + out += snprintf(out, buf + size - out, \ + "["); \ + } else { \ + out += snprintf(out, buf + size - out, \ + " ["); \ + } \ + } else if (!first) { \ + out += snprintf(out, buf + size - out, " "); \ + } \ + \ + out += snprintf(out, buf + size - out, "%s", f->string);\ + \ + if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) \ + out += snprintf(out, buf + size - out, "]"); \ + \ + first = false; \ + } \ + if (!first) \ + out += snprintf(out, buf + size - out, "\n"); \ +}) + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(RO_COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(INCOMPAT); + return out - buf; +} diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..09161b89c63e --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include <linux/kernel.h> +#include <linux/types.h> + +#include "bcache_ondisk.h" + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +/* Feature set definition */ +/* Incompat feature set */ +/* 32bit bucket size, obsoleted */ +#define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 +/* real bucket size is (1 << bucket_size) */ +#define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 + +#define BCH_FEATURE_COMPAT_SUPP 0 +#define BCH_FEATURE_RO_COMPAT_SUPP 0 +#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \ + return 0; \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); + +static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_compat & ~BCH_FEATURE_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_ro_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_ro_compat & ~BCH_FEATURE_RO_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_incompat_features(struct cache_sb *sb) +{ + return ((sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0); +} + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size); + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 4d93f07f63e5..020712c5203f 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,8 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, bio->bi_inline_vecs, bucket_pages(c)); + bio_init(bio, NULL, bio->bi_inline_vecs, + meta_bucket_pages(&c->cache->sb), 0); return bio; } @@ -36,7 +37,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + bio_set_dev(bio, c->cache->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(c, bio, bio->bi_private); @@ -65,15 +66,15 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. */ if (bio->bi_opf & REQ_RAHEAD) { - pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", - dc->backing_dev_name); + pr_warn_ratelimited("%pg: Read-ahead I/O failed on backing device, ignore\n", + dc->bdev); return; } errors = atomic_add_return(1, &dc->io_errors); if (errors < dc->error_limit) - pr_err("%s: IO error on backing device, unrecoverable", - dc->backing_dev_name); + pr_err("%pg: IO error on backing device, unrecoverable\n", + dc->bdev); else bch_cached_dev_error(dc); } @@ -123,13 +124,13 @@ void bch_count_io_errors(struct cache *ca, errors >>= IO_ERROR_SHIFT; if (errors < ca->set->error_limit) - pr_err("%s: IO error on %s%s", - ca->cache_dev_name, m, + pr_err("%pg: IO error on %s%s\n", + ca->bdev, m, is_read ? ", recovering." : "."); else bch_cache_set_error(ca->set, - "%s: too many IO errors %s", - ca->cache_dev_name, m); + "%pg: too many IO errors %s\n", + ca->bdev, m); } } @@ -137,7 +138,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); + struct cache *ca = c->cache; int is_read = (bio_data_dir(bio) == READ ? 1 : 0); unsigned int threshold = op_is_write(bio_op(bio)) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 0e3ff9745ac7..e5da469a4235 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -47,20 +47,18 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, closure_init_stack(&cl); - pr_debug("reading %u", bucket_index); + pr_debug("reading %u\n", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_READ); bio->bi_iter.bi_sector = bucket + offset; - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, 0); bch_bio_map(bio, data); closure_bio_submit(ca->set, bio, &cl); @@ -78,13 +76,13 @@ reread: left = ca->sb.bucket_size - offset; size_t blocks, bytes = set_bytes(j); if (j->magic != jset_magic(&ca->sb)) { - pr_debug("%u: bad magic", bucket_index); + pr_debug("%u: bad magic\n", bucket_index); return ret; } if (bytes > left << 9 || bytes > PAGE_SIZE << JSET_BITS) { - pr_info("%u: too big, %zu bytes, offset %u", + pr_info("%u: too big, %zu bytes, offset %u\n", bucket_index, bytes, offset); return ret; } @@ -93,12 +91,12 @@ reread: left = ca->sb.bucket_size - offset; goto reread; if (j->csum != csum_set(j)) { - pr_info("%u: bad csum, %zu bytes, offset %u", + pr_info("%u: bad csum, %zu bytes, offset %u\n", bucket_index, bytes, offset); return ret; } - blocks = set_blocks(j, block_bytes(ca->set)); + blocks = set_blocks(j, block_bytes(ca)); /* * Nodes in 'list' are in linear increasing order of @@ -111,7 +109,7 @@ reread: left = ca->sb.bucket_size - offset; * Check from the oldest jset for last_seq. If * i->j.seq < j->last_seq, it means the oldest jset * in list is expired and useless, remove it from - * this list. Otherwise, j is a condidate jset for + * this list. Otherwise, j is a candidate jset for * further following checks. */ while (!list_empty(list)) { @@ -179,115 +177,109 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) ret; \ }) - struct cache *ca; - unsigned int iter; + struct cache *ca = c->cache; int ret = 0; + struct journal_device *ja = &ca->journal; + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); + unsigned int i, l, r, m; + uint64_t seq; - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned int i, l, r, m; - uint64_t seq; - - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); - pr_debug("%u journal buckets", ca->sb.njournal_buckets); + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries + * We must try the index l with ZERO first for + * correctness due to the scenario that the journal + * bucket is circular buffer which might have wrapped */ - for (i = 0; i < ca->sb.njournal_buckets; i++) { - /* - * We must try the index l with ZERO first for - * correctness due to the scenario that the journal - * bucket is circular buffer which might have wrapped - */ - l = (i * 2654435769U) % ca->sb.njournal_buckets; + l = (i * 2654435769U) % ca->sb.njournal_buckets; - if (test_bit(l, bitmap)) - break; + if (test_bit(l, bitmap)) + break; - if (read_bucket(l)) - goto bsearch; - } + if (read_bucket(l)) + goto bsearch; + } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search\n"); - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); - l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, - l + 1)) - if (read_bucket(l)) - goto bsearch; + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) + if (read_bucket(l)) + goto bsearch; - /* no journal entries on this device? */ - if (l == ca->sb.njournal_buckets) - continue; + /* no journal entries on this device? */ + if (l == ca->sb.njournal_buckets) + goto out; bsearch: - BUG_ON(list_empty(list)); + BUG_ON(list_empty(list)); - /* Binary search */ - m = l; - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); + /* Binary search */ + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u\n", l, r); - while (l + 1 < r) { - seq = list_entry(list->prev, struct journal_replay, - list)->j.seq; + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; - m = (l + r) >> 1; - read_bucket(m); + m = (l + r) >> 1; + read_bucket(m); - if (seq != list_entry(list->prev, struct journal_replay, - list)->j.seq) - l = m; - else - r = m; - } + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } - /* - * Read buckets in reverse order until we stop finding more - * journal entries - */ - pr_debug("finishing up: m %u njournal_buckets %u", - m, ca->sb.njournal_buckets); - l = m; + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u\n", + m, ca->sb.njournal_buckets); + l = m; - while (1) { - if (!l--) - l = ca->sb.njournal_buckets - 1; + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; - if (l == m) - break; + if (l == m) + break; - if (test_bit(l, bitmap)) - continue; + if (test_bit(l, bitmap)) + continue; - if (!read_bucket(l)) - break; - } + if (!read_bucket(l)) + break; + } - seq = 0; + seq = 0; - for (i = 0; i < ca->sb.njournal_buckets; i++) - if (ja->seq[i] > seq) { - seq = ja->seq[i]; - /* - * When journal_reclaim() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; - } - } + } +out: if (!list_empty(list)) c->journal.seq = list_entry(list->prev, struct journal_replay, @@ -345,12 +337,10 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) static bool is_discard_enabled(struct cache_set *s) { - struct cache *ca; - unsigned int i; + struct cache *ca = s->cache; - for_each_cache(ca, s, i) - if (ca->discard) - return true; + if (ca->discard) + return true; return false; } @@ -370,10 +360,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) if (n != i->j.seq) { if (n == start && is_discard_enabled(s)) - pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)", + pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", n, i->j.seq - 1, start, end); else { - pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", n, i->j.seq - 1, start, end); ret = -EIO; goto err; @@ -403,7 +393,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) entries++; } - pr_info("journal replay done, %i keys in %i entries, seq %llu", + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", keys, entries, end); err: while (!list_empty(list)) { @@ -415,6 +405,11 @@ err: return ret; } +void bch_journal_space_reserve(struct journal *j) +{ + j->do_reserve = true; +} + /* Journalling */ static void btree_flush_write(struct cache_set *c) @@ -481,7 +476,7 @@ static void btree_flush_write(struct cache_set *c) break; if (btree_node_journal_flush(b)) - pr_err("BUG: flush_write bit should not be set here!"); + pr_err("BUG: flush_write bit should not be set here!\n"); mutex_lock(&b->write_lock); @@ -506,7 +501,7 @@ static void btree_flush_write(struct cache_set *c) * - If there are matched nodes recorded in btree_nodes[], * they are clean now (this is why and how the oldest * journal entry can be reclaimed). These selected nodes - * will be ignored and skipped in the folowing for-loop. + * will be ignored and skipped in the following for-loop. */ if (((btree_current_write(b)->journal - fifo_front_p) & mask) != 0) { @@ -534,13 +529,13 @@ static void btree_flush_write(struct cache_set *c) for (i = 0; i < nr; i++) { b = btree_nodes[i]; if (!b) { - pr_err("BUG: btree_nodes[%d] is NULL", i); + pr_err("BUG: btree_nodes[%d] is NULL\n", i); continue; } /* safe to check without holding b->write_lock */ if (!btree_node_journal_flush(b)) { - pr_err("BUG: bnode %p: journal_flush bit cleaned", b); + pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); continue; } @@ -548,14 +543,14 @@ static void btree_flush_write(struct cache_set *c) if (!btree_current_write(b)->journal) { clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - pr_debug("bnode %p: written by others", b); + pr_debug("bnode %p: written by others\n", b); continue; } if (!btree_node_dirty(b)) { clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - pr_debug("bnode %p: dirty bit cleaned by others", b); + pr_debug("bnode %p: dirty bit cleaned by others\n", b); continue; } @@ -611,7 +606,7 @@ static void do_journal_discard(struct cache *ca) ca->sb.njournal_buckets; atomic_set(&ja->discard_in_flight, DISCARD_READY); - /* fallthrough */ + fallthrough; case DISCARD_READY: if (ja->discard_idx == ja->last_idx) @@ -619,11 +614,9 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, bio->bi_inline_vecs, 1); - bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); + bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; @@ -633,12 +626,31 @@ static void do_journal_discard(struct cache *ca) } } +static unsigned int free_journal_buckets(struct cache_set *c) +{ + struct journal *j = &c->journal; + struct cache *ca = c->cache; + struct journal_device *ja = &c->cache->journal; + unsigned int n; + + /* In case njournal_buckets is not power of 2 */ + if (ja->cur_idx >= ja->discard_idx) + n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + else + n = ja->discard_idx - ja->cur_idx; + + if (n > (1 + j->do_reserve)) + return n - (1 + j->do_reserve); + + return 0; +} + static void journal_reclaim(struct cache_set *c) { struct bkey *k = &c->journal.key; - struct cache *ca; + struct cache *ca = c->cache; uint64_t last_seq; - unsigned int iter, n = 0; + struct journal_device *ja = &ca->journal; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -650,46 +662,29 @@ static void journal_reclaim(struct cache_set *c) /* Update last_idx */ - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - while (ja->last_idx != ja->cur_idx && - ja->seq[ja->last_idx] < last_seq) - ja->last_idx = (ja->last_idx + 1) % - ca->sb.njournal_buckets; - } + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; - for_each_cache(ca, c, iter) - do_journal_discard(ca); + do_journal_discard(ca); if (c->journal.blocks_free) goto out; - /* - * Allocate: - * XXX: Sort by free journal space - */ + if (!free_journal_buckets(c)) + goto out; - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); - /* No space available on this device */ - if (next == ja->discard_idx) - continue; + bkey_init(k); + SET_KEY_PTRS(k, 1); + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; - ja->cur_idx = next; - k->ptr[n++] = MAKE_PTR(0, - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), - ca->sb.nr_this_dev); - atomic_long_inc(&c->reclaimed_journal_buckets); - } - - if (n) { - bkey_init(k); - SET_KEY_PTRS(k, n); - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - } out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -716,7 +711,7 @@ void bch_journal_next(struct journal *j) j->cur->data->keys = 0; if (fifo_full(&j->pin)) - pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); + pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); } static void journal_write_endio(struct bio *bio) @@ -753,11 +748,11 @@ static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { struct cache_set *c = container_of(cl, struct cache_set, journal.io); - struct cache *ca; + struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * - c->sb.block_size; + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * + ca->sb.block_size; struct bio *bio; struct bio_list list; @@ -776,36 +771,32 @@ static void journal_write_unlocked(struct closure *cl) return; } - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); w->data->btree_level = c->root->level; bkey_copy(&w->data->btree_root, &c->root->key); bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); - for_each_cache(ca, c, i) - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - - w->data->magic = jset_magic(&c->sb); + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + w->data->magic = jset_magic(&ca->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); for (i = 0; i < KEY_PTRS(k); i++) { - ca = PTR_CACHE(c, k, i); + ca = c->cache; bio = &ca->journal.bio; atomic_long_add(sectors, &ca->meta_sectors_written); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); trace_bcache_journal_write(bio, w->data->keys); @@ -862,6 +853,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, size_t sectors; struct closure cl; bool wait = false; + struct cache *ca = c->cache; closure_init_stack(&cl); @@ -871,10 +863,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; + block_bytes(ca)) * ca->sb.block_size; if (sectors <= min_t(size_t, - c->journal.blocks_free * c->sb.block_size, + c->journal.blocks_free * ca->sb.block_size, PAGE_SECTORS << JSET_BITS)) return w; @@ -939,7 +931,7 @@ atomic_t *bch_journal(struct cache_set *c, if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return NULL; - if (!CACHE_SYNC(&c->sb)) + if (!CACHE_SYNC(&c->cache->sb)) return NULL; w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); @@ -955,8 +947,8 @@ atomic_t *bch_journal(struct cache_set *c, journal_try_write(c); } else if (!w->dirty) { w->dirty = true; - schedule_delayed_work(&c->journal.work, - msecs_to_jiffies(c->journal_delay_ms)); + queue_delayed_work(bch_flush_wq, &c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); spin_unlock(&c->journal.lock); } else { spin_unlock(&c->journal.lock); @@ -999,8 +991,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[1].c = c; if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index f2ea34d5f431..cd316b4a1e95 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -105,6 +105,7 @@ struct journal { spinlock_t lock; spinlock_t flush_write_lock; bool btree_flushing; + bool do_reserve; /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list); void bch_journal_free(struct cache_set *c); int bch_journal_alloc(struct cache_set *c); +void bch_journal_space_reserve(struct journal *j); #endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7891fb512736..99499d1f6e66 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,8 +79,8 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, bio->bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS)); + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); @@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -196,50 +196,48 @@ static unsigned int bucket_heap_top(struct cache *ca) void bch_moving_gc(struct cache_set *c) { - struct cache *ca; + struct cache *ca = c->cache; struct bucket *b; - unsigned int i; + unsigned long sectors_to_move, reserve_sectors; if (!c->copy_gc_enabled) return; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + sectors_to_move = 0; + reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.used = 0; - - for_each_bucket(b, ca) { - if (GC_MARK(b) == GC_MARK_METADATA || - !GC_SECTORS_USED(b) || - GC_SECTORS_USED(b) == ca->sb.bucket_size || - atomic_read(&b->pin)) - continue; - - if (!heap_full(&ca->heap)) { - sectors_to_move += GC_SECTORS_USED(b); - heap_add(&ca->heap, b, bucket_cmp); - } else if (bucket_cmp(b, heap_peek(&ca->heap))) { - sectors_to_move -= bucket_heap_top(ca); - sectors_to_move += GC_SECTORS_USED(b); - - ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_cmp); - } - } + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) + continue; - while (sectors_to_move > reserve_sectors) { - heap_pop(&ca->heap, b, bucket_cmp); - sectors_to_move -= GC_SECTORS_USED(b); + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); } + } - while (heap_pop(&ca->heap, b, bucket_cmp)) - SET_GC_MOVE(b, 1); + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); } + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); + mutex_unlock(&c->bucket_lock); c->moving_gc_keys.last_scanned = ZERO_KEY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 820d8402a1dc..3427555b0cca 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -44,10 +44,10 @@ static void bio_csum(struct bio *bio, struct bkey *k) uint64_t csum = 0; bio_for_each_segment(bv, bio, iter) { - void *d = kmap(bv.bv_page) + bv.bv_offset; + void *d = bvec_kmap_local(&bv); - csum = bch_crc64_update(csum, d, bv.bv_len); - kunmap(bv.bv_page); + csum = crc64_be(csum, d, bv.bv_len); + kunmap_local(d); } k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); @@ -99,7 +99,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset)) return -ENOMEM; return __bch_keylist_realloc(l, u64s); @@ -110,7 +110,7 @@ static void bch_data_invalidate(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio; - pr_debug("invalidating %i sectors from %llu", + pr_debug("invalidating %i sectors from %llu\n", bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); while (bio_sectors(bio)) { @@ -394,14 +394,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || + bio_sectors(bio) & (c->cache->sb.block_size - 1)) { + pr_debug("skipping unaligned io\n"); goto skip; } if (bypass_torture_test(dc)) { - if ((get_random_int() & 3) == 3) + if (prandom_u32_max(4) == 3) goto skip; else goto rescale; @@ -475,6 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; + struct block_device *orig_bdev; unsigned long start_time; struct btree_op op; @@ -650,8 +651,8 @@ static void backing_request_endio(struct bio *bio) */ if (unlikely(s->iop.writeback && bio->bi_opf & REQ_PREFLUSH)) { - pr_err("Can't flush %s: returned bi_status %i", - dc->backing_dev_name, bio->bi_status); + pr_err("Can't flush %pg: returned bi_status %i\n", + dc->bdev, bio->bi_status); } else { /* set to orig_bio->bi_status in bio_complete() */ s->iop.status = bio->bi_status; @@ -668,9 +669,9 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), - &s->d->disk->part0, s->start_time); - + /* Count on bcache device */ + bio_end_io_acct_remapped(s->orig_bio, s->start_time, + s->orig_bdev); trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -684,8 +685,7 @@ static void do_bio_hook(struct search *s, { struct bio *bio = &s->bio.bio; - bio_init(bio, NULL, 0); - __bio_clone_fast(bio, orig_bio); + bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO); /* * bi_end_io can be set separately somewhere else, e.g. the * variants in, @@ -713,7 +713,8 @@ static void search_free(struct closure *cl) } static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) + struct bcache_device *d, struct block_device *orig_bdev, + unsigned long start_time) { struct search *s; @@ -730,8 +731,9 @@ static inline struct search *search_alloc(struct bio *bio, s->recoverable = 1; s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; - s->start_time = jiffies; - + /* Count on the bcache device */ + s->orig_bdev = orig_bdev; + s->start_time = start_time; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -828,11 +830,11 @@ static void cached_dev_read_done(struct closure *cl) */ if (s->iop.bio) { - bio_reset(s->iop.bio); + bio_reset(s->iop.bio, s->cache_miss->bi_bdev, REQ_OP_READ); s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; - bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + bio_clone_blkg_association(s->iop.bio, s->cache_miss); bch_bio_map(s->iop.bio, NULL); bio_copy_data(s->cache_miss, s->iop.bio); @@ -877,9 +879,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors) { int ret = MAP_CONTINUE; - unsigned int reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + unsigned int size_limit; s->cache_missed = 1; @@ -889,13 +891,10 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; } - if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & (REQ_META|REQ_PRIO)) && - s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) - reada = min_t(sector_t, dc->readahead >> 9, - get_capacity(bio->bi_disk) - bio_end_sector(bio)); - - s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); + /* Limitation for valid replace key size and cache_bio bvecs number */ + size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS, + (1 << KEY_SIZE_BITS) - 1); + s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio)); s->iop.replace_key = KEY(s->iop.inode, bio->bi_iter.bi_sector + s->insert_bio_sectors, @@ -907,19 +906,19 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->iop.replace = true; - miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); + miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO, + &s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? MAP_DONE : -EINTR; - cache_bio = bio_alloc_bioset(GFP_NOWAIT, + cache_bio = bio_alloc_bioset(miss->bi_bdev, DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), - &dc->disk.bio_split); + 0, GFP_NOWAIT, &dc->disk.bio_split); if (!cache_bio) goto out_submit; cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; - bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = backing_request_endio; @@ -929,9 +928,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - if (reada) - bch_mark_cache_readahead(s->iop.c, s->d); - s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); @@ -1009,7 +1005,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bio_get(s->iop.bio); if (bio_op(bio) == REQ_OP_DISCARD && - !blk_queue_discard(bdev_get_queue(dc->bdev))) + !bdev_max_discard_sectors(dc->bdev)) goto insert_data; /* I/O request sent to backing device */ @@ -1027,21 +1023,21 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) */ struct bio *flush; - flush = bio_alloc_bioset(GFP_NOIO, 0, - &dc->disk.bio_split); + flush = bio_alloc_bioset(bio->bi_bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &dc->disk.bio_split); if (!flush) { s->iop.status = BLK_STS_RESOURCE; goto insert_data; } - bio_copy_dev(flush, bio); flush->bi_end_io = backing_request_endio; flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, flush, cl); } } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split); + s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &dc->disk.bio_split); /* I/O request sent to backing device */ bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl); @@ -1072,6 +1068,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; + struct block_device *orig_bdev; }; static void detached_dev_end_io(struct bio *bio) @@ -1082,8 +1079,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), - &ddip->d->disk->part0, ddip->start_time); + /* Count on the bcache device */ + bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1096,7 +1093,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io(bio); } -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, + struct block_device *orig_bdev, unsigned long start_time) { struct detached_dev_io_private *ddip; struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1107,18 +1105,26 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) * which would call closure_get(&dc->disk.cl) */ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + if (!ddip) { + bio->bi_status = BLK_STS_RESOURCE; + bio->bi_end_io(bio); + return; + } + ddip->d = d; - ddip->start_time = jiffies; + /* Count on the bcache device */ + ddip->orig_bdev = orig_bdev; + ddip->start_time = start_time; ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; bio->bi_private = ddip; if ((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) + !bdev_max_discard_sectors(dc->bdev)) bio->bi_end_io(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } static void quit_max_writeback_rate(struct cache_set *c, @@ -1161,19 +1167,20 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct bio *bio) +void cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_disk->private_data; + struct block_device *orig_bdev = bio->bi_bdev; + struct bcache_device *d = orig_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); + unsigned long start_time; int rw = bio_data_dir(bio); if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || dc->io_disable)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - return BLK_QC_T_NONE; + return; } if (likely(d->c)) { @@ -1191,22 +1198,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, } } - generic_start_io_acct(q, - bio_op(bio), - bio_sectors(bio), - &d->disk->part0); + start_time = bio_start_io_acct(bio); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, orig_bdev, start_time); trace_bcache_request_start(s->d, bio); if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under - * generic_make_request + * submit_bio_noacct */ continue_at_nobarrier(&s->cl, cached_dev_nodata, @@ -1221,9 +1225,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, } } else /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); - - return BLK_QC_T_NONE; + detached_dev_do_request(d, bio, orig_bdev, start_time); } static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, @@ -1233,41 +1235,13 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); -} - -static int cached_dev_congested(void *data, int bits) -{ - struct bcache_device *d = data; - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - struct request_queue *q = bdev_get_queue(dc->bdev); - int ret = 0; - - if (bdi_congested(q->backing_dev_info, bits)) - return 1; - - if (cached_dev_get(dc)) { - unsigned int i; - struct cache *ca; - - for_each_cache(ca, d->c, i) { - q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - cached_dev_put(dc); - } - - return ret; + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); } void bch_cached_dev_request_init(struct cached_dev *dc) { - struct gendisk *g = dc->disk.disk; - - g->queue->make_request_fn = cached_dev_make_request; - g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; } @@ -1301,22 +1275,19 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bio *bio) +void flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - return BLK_QC_T_NONE; + return; } - generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); - - s = search_alloc(bio, d); + s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); cl = &s->cl; bio = &s->bio.bio; @@ -1324,13 +1295,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, if (!bio->bi_iter.bi_size) { /* - * can't call bch_journal_meta from under - * generic_make_request + * can't call bch_journal_meta from under submit_bio_noacct */ continue_at_nobarrier(&s->cl, flash_dev_nodata, bcache_wq); - return BLK_QC_T_NONE; + return; } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), @@ -1346,7 +1316,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, } continue_at(cl, search_free, NULL); - return BLK_QC_T_NONE; } static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, @@ -1355,28 +1324,8 @@ static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, return -ENOTTY; } -static int flash_dev_congested(void *data, int bits) -{ - struct bcache_device *d = data; - struct request_queue *q; - struct cache *ca; - unsigned int i; - int ret = 0; - - for_each_cache(ca, d->c, i) { - q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - return ret; -} - void bch_flash_dev_request_init(struct bcache_device *d) { - struct gendisk *g = d->disk; - - g->queue->make_request_fn = flash_dev_make_request; - g->queue->backing_dev_info->congested_fn = flash_dev_congested; d->cache_miss = flash_dev_cache_miss; d->ioctl = flash_dev_ioctl; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c64dbd7a91aa..38ab4856eaab 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,7 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); +void cached_dev_submit_bio(struct bio *bio); + void bch_flash_dev_request_init(struct bcache_device *d); +void flash_dev_submit_bio(struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 503aafe188dc..68b02216033d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -46,7 +46,6 @@ read_attribute(cache_misses); read_attribute(cache_bypass_hits); read_attribute(cache_bypass_misses); read_attribute(cache_hit_ratio); -read_attribute(cache_readaheads); read_attribute(cache_miss_collisions); read_attribute(bypassed); @@ -64,7 +63,6 @@ SHOW(bch_stats) DIV_SAFE(var(cache_hits) * 100, var(cache_hits) + var(cache_misses))); - var_print(cache_readaheads); var_print(cache_miss_collisions); sysfs_hprint(bypassed, var(sectors_bypassed) << 9); #undef var @@ -80,17 +78,17 @@ static void bch_stats_release(struct kobject *k) { } -static struct attribute *bch_stats_files[] = { +static struct attribute *bch_stats_attrs[] = { &sysfs_cache_hits, &sysfs_cache_misses, &sysfs_cache_bypass_hits, &sysfs_cache_bypass_misses, &sysfs_cache_hit_ratio, - &sysfs_cache_readaheads, &sysfs_cache_miss_collisions, &sysfs_bypassed, NULL }; +ATTRIBUTE_GROUPS(bch_stats); static KTYPE(bch_stats); int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, @@ -113,7 +111,6 @@ void bch_cache_accounting_clear(struct cache_accounting *acc) acc->total.cache_misses = 0; acc->total.cache_bypass_hits = 0; acc->total.cache_bypass_misses = 0; - acc->total.cache_readaheads = 0; acc->total.cache_miss_collisions = 0; acc->total.sectors_bypassed = 0; } @@ -145,7 +142,6 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) scale_stat(&stats->cache_misses); scale_stat(&stats->cache_bypass_hits); scale_stat(&stats->cache_bypass_misses); - scale_stat(&stats->cache_readaheads); scale_stat(&stats->cache_miss_collisions); scale_stat(&stats->sectors_bypassed); } @@ -168,7 +164,6 @@ static void scale_accounting(struct timer_list *t) move_stat(cache_misses); move_stat(cache_bypass_hits); move_stat(cache_bypass_misses); - move_stat(cache_readaheads); move_stat(cache_miss_collisions); move_stat(sectors_bypassed); @@ -209,14 +204,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, mark_cache_stats(&c->accounting.collector, hit, bypass); } -void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) -{ - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - - atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&c->accounting.collector.cache_readaheads); -} - void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index abfaabf7e7fc..bd3afc856d53 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -7,7 +7,6 @@ struct cache_stat_collector { atomic_t cache_misses; atomic_t cache_bypass_hits; atomic_t cache_bypass_misses; - atomic_t cache_readaheads; atomic_t cache_miss_collisions; atomic_t sectors_bypassed; }; @@ -55,7 +54,6 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc); void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, bool hit, bool bypass); -void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d); void bch_mark_sectors_bypassed(struct cache_set *c, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0c3c5419c52b..ba3909bb6bea 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,12 +13,14 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> +#include <linux/pagemap.h> #include <linux/debugfs.h> -#include <linux/genhd.h> #include <linux/idr.h> #include <linux/kthread.h> +#include <linux/workqueue.h> #include <linux/module.h> #include <linux/random.h> #include <linux/reboot.h> @@ -47,6 +49,7 @@ static int bcache_major; static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; +struct workqueue_struct *bch_flush_wq; struct workqueue_struct *bch_journal_wq; @@ -58,6 +61,108 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + if (bch_has_feature_large_bucket(sb)) { + unsigned int max, order; + + max = sizeof(unsigned int) * BITS_PER_BYTE - 1; + order = le16_to_cpu(s->bucket_size); + /* + * bcache tool will make sure the overflow won't + * happen, an error message here is enough. + */ + if (order > max) + pr_err("Bucket size (1 << %u) overflows\n", + order); + bucket_size = 1 << order; + } else if (bch_has_feature_obso_large_bucket(sb)) { + bucket_size += + le16_to_cpu(s->obso_bucket_size_hi) << 16; + } + } + + return bucket_size; +} + +static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, + struct cache_sb_disk *s) +{ + const char *err; + unsigned int i; + + sb->first_bucket= le16_to_cpu(s->first_bucket); + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->bucket_size = get_bucket_size(sb, s); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block size (not power of 2)"; + if (!is_power_of_2(sb->block_size)) + goto err; + + err = "Bad block size (larger than page size)"; + if (sb->block_size > PAGE_SECTORS) + goto err; + + err = "Bad bucket size (not power of 2)"; + if (!is_power_of_2(sb->bucket_size)) + goto err; + + err = "Bad bucket size (smaller than page size)"; + if (sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + err = NULL; +err: + return err; +} + + static const char *read_super(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk **res) { @@ -83,13 +188,12 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->flags = le64_to_cpu(s->flags); sb->seq = le64_to_cpu(s->seq); sb->last_mount = le32_to_cpu(s->last_mount); - sb->first_bucket = le16_to_cpu(s->first_bucket); sb->keys = le16_to_cpu(s->keys); for (i = 0; i < SB_JOURNAL_BUCKETS; i++) sb->d[i] = le64_to_cpu(s->d[i]); - pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n", sb->version, sb->flags, sb->seq, sb->keys); err = "Not a bcache superblock (bad offset)"; @@ -100,10 +204,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (memcmp(sb->magic, bcache_magic, 16)) goto err; - err = "Too many journal buckets"; - if (sb->keys > SB_JOURNAL_BUCKETS) - goto err; - err = "Bad checksum"; if (s->csum != csum_set(s)) goto err; @@ -123,6 +223,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -132,55 +233,35 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, break; case BCACHE_SB_VERSION_CDEV: case BCACHE_SB_VERSION_CDEV_WITH_UUID: - sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); - - sb->nr_in_set = le16_to_cpu(s->nr_in_set); - sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); - - err = "Too many buckets"; - if (sb->nbuckets > LONG_MAX) - goto err; - - err = "Not enough buckets"; - if (sb->nbuckets < 1 << 7) - goto err; - - err = "Bad block/bucket size"; - if (!is_power_of_2(sb->block_size) || - sb->block_size > PAGE_SECTORS || - !is_power_of_2(sb->bucket_size) || - sb->bucket_size < PAGE_SECTORS) - goto err; - - err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < - sb->bucket_size * sb->nbuckets) + err = read_super_common(sb, bdev, s); + if (err) goto err; + break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); - err = "Bad UUID"; - if (bch_is_zero(sb->set_uuid, 16)) + /* Check incompatible features */ + err = "Unsupported compatible feature found"; + if (bch_has_unknown_compat_features(sb)) goto err; - err = "Bad cache device number in set"; - if (!sb->nr_in_set || - sb->nr_in_set <= sb->nr_this_dev || - sb->nr_in_set > MAX_CACHES_PER_SET) + err = "Unsupported read-only compatible feature found"; + if (bch_has_unknown_ro_compat_features(sb)) goto err; - err = "Journal buckets not sequential"; - for (i = 0; i < sb->keys; i++) - if (sb->d[i] != sb->first_bucket + i) - goto err; - - err = "Too many journal buckets"; - if (sb->first_bucket + sb->keys > sb->nbuckets) + err = "Unsupported incompatible feature found"; + if (bch_has_unknown_incompat_features(sb)) goto err; - err = "Invalid superblock: first bucket comes before end of super"; - if (sb->first_bucket * sb->bucket_size < 16) + err = read_super_common(sb, bdev, s); + if (err) goto err; - break; default: err = "Unsupported superblock version"; @@ -216,7 +297,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -232,9 +312,16 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); - pr_debug("ver %llu, flags %llu, seq %llu", + pr_debug("ver %llu, flags %llu, seq %llu\n", sb->version, sb->flags, sb->seq); submit_bio(bio); @@ -255,8 +342,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_init(bio, dc->sb_bv, 1); - bio_set_dev(bio, dc->bdev); + bio_init(bio, dc->bdev, dc->sb_bv, 1, 0); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; @@ -287,31 +373,24 @@ static void bcache_write_super_unlock(struct closure *cl) void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; - struct cache *ca; - unsigned int i; + struct cache *ca = c->cache; + struct bio *bio = &ca->sb_bio; + unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); - c->sb.seq++; - - for_each_cache(ca, c, i) { - struct bio *bio = &ca->sb_bio; + ca->sb.seq++; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; - ca->sb.seq = c->sb.seq; - ca->sb.last_mount = c->sb.last_mount; + if (ca->sb.version < version) + ca->sb.version = version; - SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); - - bio_init(bio, ca->sb_bv, 1); - bio_set_dev(bio, ca->bdev); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; + bio_init(bio, ca->bdev, ca->sb_bv, 1, 0); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; - closure_get(cl); - __write_super(&ca->sb, ca->sb_disk, bio); - } + closure_get(cl); + __write_super(&ca->sb, ca->sb_disk, bio); closure_return_with_destructor(cl, bcache_write_super_unlock); } @@ -335,8 +414,8 @@ static void uuid_io_unlock(struct closure *cl) up(&c->uuid_write_mutex); } -static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, - struct bkey *k, struct closure *parent) +static void uuid_io(struct cache_set *c, blk_opf_t opf, struct bkey *k, + struct closure *parent) { struct closure *cl = &c->uuid_write; struct uuid_entry *u; @@ -350,26 +429,26 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); - bio->bi_opf = REQ_SYNC | REQ_META | op_flags; + bio->bi_opf = opf | REQ_SYNC | REQ_META; bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; bio->bi_private = cl; - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); bch_bio_map(bio, c->uuids); bch_submit_bbio(bio, c, k, i); - if (op != REQ_OP_WRITE) + if ((opf & REQ_OP_MASK) != REQ_OP_WRITE) break; } bch_extent_to_text(buf, sizeof(buf), k); - pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf); + pr_debug("%s UUIDs at %s\n", (opf & REQ_OP_MASK) == REQ_OP_WRITE ? + "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) if (!bch_is_zero(u->uuid, 16)) - pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u", + pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n", u - c->uuids, u->uuid, u->label, u->first_reg, u->last_reg, u->invalidated); @@ -384,7 +463,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); - uuid_io(c, REQ_OP_READ, 0, k, cl); + uuid_io(c, REQ_OP_READ, k, cl); if (j->version < BCACHE_JSET_VERSION_UUIDv1) { struct uuid_entry_v0 *u0 = (void *) c->uuids; @@ -421,20 +500,21 @@ static int __uuid_write(struct cache_set *c) { BKEY_PADDED(key) k; struct closure cl; - struct cache *ca; + struct cache *ca = c->cache; + unsigned int size; closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) return 1; - SET_KEY_SIZE(&k.key, c->sb.bucket_size); - uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); + size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; + SET_KEY_SIZE(&k.key, size); + uuid_io(c, REQ_OP_WRITE, &k.key, &cl); closure_sync(&cl); /* Only one bucket used for uuid write */ - ca = PTR_CACHE(c, &k.key, 0); atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written); bkey_copy(&c->uuid_bucket, &k.key); @@ -507,8 +587,7 @@ static void prio_endio(struct bio *bio) closure_put(&ca->prio); } -static void prio_io(struct cache *ca, uint64_t bucket, int op, - unsigned long op_flags) +static void prio_io(struct cache *ca, uint64_t bucket, blk_opf_t opf) { struct closure *cl = &ca->prio; struct bio *bio = bch_bbio_alloc(ca->set); @@ -517,11 +596,11 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; bio_set_dev(bio, ca->bdev); - bio->bi_iter.bi_size = bucket_bytes(ca); + bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb); bio->bi_end_io = prio_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); + bio->bi_opf = opf | REQ_SYNC | REQ_META; bch_bio_map(bio, ca->disk_buckets); closure_bio_submit(ca->set, bio, &ca->prio); @@ -534,7 +613,7 @@ int bch_prio_write(struct cache *ca, bool wait) struct bucket *b; struct closure cl; - pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n", fifo_used(&ca->free[RESERVE_PRIO]), fifo_used(&ca->free[RESERVE_NONE]), fifo_used(&ca->free_inc)); @@ -575,13 +654,13 @@ int bch_prio_write(struct cache *ca, bool wait) p->next_bucket = ca->prio_buckets[i + 1]; p->magic = pset_magic(&ca->sb); - p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8); bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); - prio_io(ca, bucket, REQ_OP_WRITE, 0); + prio_io(ca, bucket, REQ_OP_WRITE); mutex_lock(&ca->set->bucket_lock); ca->prio_buckets[i] = bucket; @@ -625,16 +704,16 @@ static int prio_read(struct cache *ca, uint64_t bucket) ca->prio_last_buckets[bucket_nr] = bucket; bucket_nr++; - prio_io(ca, bucket, REQ_OP_READ, 0); + prio_io(ca, bucket, REQ_OP_READ); if (p->csum != - bch_crc64(&p->magic, bucket_bytes(ca) - 8)) { - pr_warn("bad csum reading priorities"); + bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) { + pr_warn("bad csum reading priorities\n"); goto out; } if (p->magic != pset_magic(&ca->sb)) { - pr_warn("bad magic reading priorities"); + pr_warn("bad magic reading priorities\n"); goto out; } @@ -679,7 +758,16 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, return d->ioctl(d, mode, cmd, arg); } -static const struct block_device_operations bcache_ops = { +static const struct block_device_operations bcache_cached_ops = { + .submit_bio = cached_dev_submit_bio, + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +static const struct block_device_operations bcache_flash_ops = { + .submit_bio = flash_dev_submit_bio, .open = open_dev, .release = release_dev, .ioctl = ioctl_dev, @@ -702,37 +790,33 @@ static void bcache_device_unlink(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - unsigned int i; - struct cache *ca; + struct cache *ca = d->c->cache; sysfs_remove_link(&d->c->kobj, d->name); sysfs_remove_link(&d->kobj, "cache"); - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + bd_unlink_disk_holder(ca->bdev, d->disk); } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, const char *name) { - unsigned int i; - struct cache *ca; + struct cache *ca = c->cache; int ret; - for_each_cache(ca, d->c, i) - bd_link_disk_holder(ca->bdev, d->disk); + bd_link_disk_holder(ca->bdev, d->disk); snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id); ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); if (ret < 0) - pr_err("Couldn't create device -> cache set symlink"); + pr_err("Couldn't create device -> cache set symlink\n"); ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); if (ret < 0) - pr_err("Couldn't create cache set -> device symlink"); + pr_err("Couldn't create cache set -> device symlink\n"); clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } @@ -789,20 +873,14 @@ static void bcache_device_free(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (disk) - pr_info("%s stopped", disk->disk_name); + pr_info("%s stopped\n", disk->disk_name); else - pr_err("bcache device (NULL gendisk) stopped"); + pr_err("bcache device (NULL gendisk) stopped\n"); if (d->c) bcache_device_detach(d); if (disk) { - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - - if (disk->queue) - blk_cleanup_queue(disk->queue); - ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); put_disk(disk); @@ -816,24 +894,25 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors) + sector_t sectors, struct block_device *cached_bdev, + const struct block_device_operations *ops) { struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); - size_t n; + uint64_t n; int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; - d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - - if (!d->nr_stripes || d->nr_stripes > max_stripes) { - pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", - (unsigned int)d->nr_stripes); + n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); + if (!n || n > max_stripes) { + pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n", + n); return -ENOMEM; } + d->nr_stripes = n; n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL); @@ -843,56 +922,69 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; - d->disk = alloc_disk(BCACHE_MINORS); + d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; d->disk->first_minor = idx_to_first_minor(idx); - d->disk->fops = &bcache_ops; + d->disk->minors = BCACHE_MINORS; + d->disk->fops = ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) - return -ENOMEM; - - blk_queue_make_request(q, NULL); - d->disk->queue = q; - q->queuedata = d; - q->backing_dev_info->congested_data = d; + q = d->disk->queue; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_segments = BIO_MAX_VECS; blk_queue_max_discard_sectors(q, UINT_MAX); q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; + + if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) { + /* + * This should only happen with BCACHE_SB_VERSION_BDEV. + * Block/page size is checked for BCACHE_SB_VERSION_CDEV. + */ + pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", + d->disk->disk_name, q->limits.logical_block_size, + PAGE_SIZE, bdev_logical_block_size(cached_bdev)); + + /* This also adjusts physical block size/min io size if needed */ + blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev)); + } + blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue); blk_queue_write_cache(q, true, true); return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } @@ -905,7 +997,7 @@ static void calc_cached_dev_sectors(struct cache_set *c) struct cached_dev *dc; list_for_each_entry(dc, &c->cached_devs, list) - sectors += bdev_sectors(dc->bdev); + sectors += bdev_nr_sectors(dc->bdev); c->cached_dev_sectors = sectors; } @@ -929,11 +1021,11 @@ static int cached_dev_status_update(void *arg) dc->offline_seconds = 0; if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { - pr_err("%s: device offline for %d seconds", - dc->backing_dev_name, + pr_err("%pg: device offline for %d seconds\n", + dc->bdev, BACKING_DEV_OFFLINE_TIMEOUT); - pr_err("%s: disable I/O request due to backing " - "device offline", dc->disk.name); + pr_err("%s: disable I/O request due to backing device offline\n", + dc->disk.name); dc->io_disable = true; /* let others know earlier that io_disable is true */ smp_mb(); @@ -950,6 +1042,7 @@ static int cached_dev_status_update(void *arg) int bch_cached_dev_run(struct cached_dev *dc) { + int ret = 0; struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { @@ -960,21 +1053,15 @@ int bch_cached_dev_run(struct cached_dev *dc) }; if (dc->io_disable) { - pr_err("I/O disabled on cached dev %s", - dc->backing_dev_name); - kfree(env[1]); - kfree(env[2]); - kfree(buf); - return -EIO; + pr_err("I/O disabled on cached dev %pg\n", dc->bdev); + ret = -EIO; + goto out; } if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); - pr_info("cached dev %s is running already", - dc->backing_dev_name); - return -EBUSY; + pr_info("cached dev %pg is running already\n", dc->bdev); + ret = -EBUSY; + goto out; } if (!d->c && @@ -988,33 +1075,35 @@ int bch_cached_dev_run(struct cached_dev *dc) closure_sync(&cl); } - add_disk(d->disk); + ret = add_disk(d->disk); + if (ret) + goto out; bd_link_disk_holder(dc->bdev, dc->disk.disk); /* * won't show up in the uevent file, use udevadm monitor -e instead * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); - kfree(env[1]); - kfree(env[2]); - kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { - pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); - return -ENOMEM; + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n"); + ret = -ENOMEM; + goto out; } dc->status_update_thread = kthread_run(cached_dev_status_update, dc, "bcache_status_update"); if (IS_ERR(dc->status_update_thread)) { - pr_warn("failed to create bcache_status_update kthread, " - "continue to run without monitoring backing " - "device status"); + pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n"); } - return 0; +out: + kfree(env[1]); + kfree(env[2]); + kfree(buf); + return ret; } /* @@ -1037,7 +1126,7 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) } while (time_out > 0); if (time_out == 0) - pr_warn("give up waiting for dc->writeback_write_update to quit"); + pr_warn("give up waiting for dc->writeback_write_update to quit\n"); cancel_delayed_work_sync(&dc->writeback_rate_update); } @@ -1045,9 +1134,7 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); - struct closure cl; - - closure_init_stack(&cl); + struct cache_set *c = dc->disk.c; BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); @@ -1061,24 +1148,18 @@ static void cached_dev_detach_finish(struct work_struct *w) dc->writeback_thread = NULL; } - memset(&dc->sb.set_uuid, 0, 16); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - mutex_lock(&bch_register_lock); - calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + calc_cached_dev_sectors(c); clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); mutex_unlock(&bch_register_lock); - pr_info("Caching disabled for %s", dc->backing_dev_name); + pr_info("Caching disabled for %pg\n", dc->bdev); /* Drop ref we took in cached_dev_detach() */ closure_put(&dc->disk.cl); @@ -1113,34 +1194,32 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, struct cached_dev *exist_dc, *t; int ret = 0; - if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || - (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) + if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16))) return -ENOENT; if (dc->disk.c) { - pr_err("Can't attach %s: already attached", - dc->backing_dev_name); + pr_err("Can't attach %pg: already attached\n", dc->bdev); return -EINVAL; } if (test_bit(CACHE_SET_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", - dc->backing_dev_name); + pr_err("Can't attach %pg: shutting down\n", dc->bdev); return -EINVAL; } - if (dc->sb.block_size < c->sb.block_size) { + if (dc->sb.block_size < c->cache->sb.block_size) { /* Will die */ - pr_err("Couldn't attach %s: block size less than set's block size", - dc->backing_dev_name); + pr_err("Couldn't attach %pg: block size less than set's block size\n", + dc->bdev); return -EINVAL; } /* Check whether already attached */ list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) { if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) { - pr_err("Tried to attach %s but duplicate UUID already attached", - dc->backing_dev_name); + pr_err("Tried to attach %pg but duplicate UUID already attached\n", + dc->bdev); return -EINVAL; } @@ -1158,15 +1237,13 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, if (!u) { if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - pr_err("Couldn't find uuid for %s in set", - dc->backing_dev_name); + pr_err("Couldn't find uuid for %pg in set\n", dc->bdev); return -ENOENT; } u = uuid_find_empty(c); if (!u) { - pr_err("Not caching %s, no room for UUID", - dc->backing_dev_name); + pr_err("Not caching %pg, no room for UUID\n", dc->bdev); return -EINVAL; } } @@ -1186,7 +1263,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, u->first_reg = u->last_reg = rtime; bch_uuid_write(c); - memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + memcpy(dc->sb.set_uuid, c->set_uuid, 16); SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, &cl); @@ -1211,7 +1288,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, down_write(&dc->writeback_lock); if (bch_cached_dev_writeback_start(dc)) { up_write(&dc->writeback_lock); - pr_err("Couldn't start writeback facilities for %s", + pr_err("Couldn't start writeback facilities for %s\n", dc->disk.disk->disk_name); return -ENOMEM; } @@ -1234,21 +1311,26 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, */ kthread_stop(dc->writeback_thread); cancel_writeback_rate_update_dwork(dc); - pr_err("Couldn't run cached device %s", - dc->backing_dev_name); + pr_err("Couldn't run cached device %pg\n", dc->bdev); return ret; } bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr); + if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(dc->disk.disk, 1); + } + /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); - pr_info("Caching %s as %s on set %pU", - dc->backing_dev_name, + pr_info("Caching %pg as %s on set %pU\n", + dc->bdev, dc->disk.disk->disk_name, - dc->disk.c->sb.set_uuid); + dc->disk.c->set_uuid); return 0; } @@ -1275,8 +1357,10 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - if (atomic_read(&dc->running)) + if (atomic_read(&dc->running)) { bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + del_gendisk(dc->disk.disk); + } bcache_device_free(&dc->disk); list_del(&dc->list); @@ -1339,13 +1423,13 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, + dc->bdev, &bcache_cached_ops); if (ret) return ret; - dc->disk.disk->queue->backing_dev_info->ra_pages = - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); + blk_queue_io_opt(dc->disk.disk->queue, + max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); atomic_set(&dc->io_errors, 0); dc->io_disable = false; @@ -1368,7 +1452,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct cache_set *c; int ret = -ENOMEM; - bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; dc->bdev->bd_holder = dc; @@ -1378,13 +1461,12 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, - "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; - pr_info("registered backing device %s", dc->backing_dev_name); + pr_info("registered backing device %pg\n", dc->bdev); list_add(&dc->list, &uncached_devices); /* attach to a matched cache set if it exists */ @@ -1401,7 +1483,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, return 0; err: - pr_notice("error %s: %s", dc->backing_dev_name, err); + pr_notice("error %pg: %s\n", dc->bdev, err); bcache_device_stop(&dc->disk); return ret; } @@ -1423,6 +1505,7 @@ static void flash_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), &d->c->flash_dev_dirty_sectors); + del_gendisk(d->disk); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); @@ -1441,33 +1524,45 @@ static void flash_dev_flush(struct closure *cl) static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) { + int err = -ENOMEM; struct bcache_device *d = kzalloc(sizeof(struct bcache_device), GFP_KERNEL); if (!d) - return -ENOMEM; + goto err_ret; closure_init(&d->cl, NULL); set_closure_fn(&d->cl, flash_dev_flush, system_wq); kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors)) + if (bcache_device_init(d, block_bytes(c->cache), u->sectors, + NULL, &bcache_flash_ops)) goto err; bcache_device_attach(d, c, u - c->uuids); bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); - add_disk(d->disk); + err = add_disk(d->disk); + if (err) + goto err; - if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) + err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"); + if (err) goto err; bcache_device_link(d, c, "volume"); + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(d->disk, 1); + } + return 0; err: kobject_put(&d->kobj); - return -ENOMEM; +err_ret: + return err; } static int flash_devs_run(struct cache_set *c) @@ -1496,7 +1591,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) u = uuid_find_empty(c); if (!u) { - pr_err("Can't create volume, no room for UUID"); + pr_err("Can't create volume, no room for UUID\n"); return -EINVAL; } @@ -1521,8 +1616,8 @@ bool bch_cached_dev_error(struct cached_dev *dc) /* make others know io_disable is true earlier */ smp_mb(); - pr_err("stop %s: too many IO errors on backing device %s\n", - dc->disk.disk->disk_name, dc->backing_dev_name); + pr_err("stop %s: too many IO errors on backing device %pg\n", + dc->disk.disk->disk_name, dc->bdev); bcache_device_stop(&dc->disk); return true; @@ -1533,6 +1628,7 @@ bool bch_cached_dev_error(struct cached_dev *dc) __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) { + struct va_format vaf; va_list args; if (c->on_error != ON_ERROR_PANIC && @@ -1540,20 +1636,22 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) return false; if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_info("CACHE_SET_IO_DISABLE already set"); + pr_info("CACHE_SET_IO_DISABLE already set\n"); /* * XXX: we can be called from atomic context * acquire_console_sem(); */ - pr_err("bcache: error on %pU: ", c->sb.set_uuid); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - pr_err(", disabling caching\n"); + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error on %pU: %pV, disabling caching\n", + c->set_uuid, &vaf); + + va_end(args); if (c->on_error == ON_ERROR_PANIC) panic("panic forced after error\n"); @@ -1575,7 +1673,6 @@ static void cache_set_free(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); struct cache *ca; - unsigned int i; debugfs_remove(c->debug); @@ -1584,15 +1681,16 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - if (ca) { - ca->set = NULL; - c->cache[ca->sb.nr_this_dev] = NULL; - kobject_put(&ca->kobj); - } - bch_bset_sort_state_free(&c->sort); - free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + + ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; + kobject_put(&ca->kobj); + } + if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); @@ -1605,7 +1703,7 @@ static void cache_set_free(struct closure *cl) list_del(&c->list); mutex_unlock(&bch_register_lock); - pr_info("Cache set %pU unregistered", c->sb.set_uuid); + pr_info("Cache set %pU unregistered\n", c->set_uuid); wake_up(&unregister_wait); closure_debug_destroy(&c->cl); @@ -1615,9 +1713,8 @@ static void cache_set_free(struct closure *cl) static void cache_set_flush(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cache *ca; + struct cache *ca = c->cache; struct btree *b; - unsigned int i; bch_cache_accounting_destroy(&c->accounting); @@ -1642,9 +1739,8 @@ static void cache_set_flush(struct closure *cl) mutex_unlock(&b->write_lock); } - for_each_cache(ca, c, i) - if (ca->alloc_thread) - kthread_stop(ca->alloc_thread); + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); if (c->journal.cur) { cancel_delayed_work_sync(&c->journal.work); @@ -1676,15 +1772,15 @@ static void conditional_stop_bcache_device(struct cache_set *c, struct cached_dev *dc) { if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { - pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.", - d->disk->disk_name, c->sb.set_uuid); + pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n", + d->disk->disk_name, c->set_uuid); bcache_device_stop(d); } else if (atomic_read(&dc->has_dirty)) { /* * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO * and dc->has_dirty == 1 */ - pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n", d->disk->disk_name); /* * There might be a small time gap that cache set is @@ -1706,7 +1802,7 @@ static void conditional_stop_bcache_device(struct cache_set *c, * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO * and dc->has_dirty == 0 */ - pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.", + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n", d->disk->disk_name); } } @@ -1754,12 +1850,13 @@ void bch_cache_set_unregister(struct cache_set *c) bch_cache_set_stop(c); } -#define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) +#define alloc_meta_bucket_pages(gfp, sb) \ + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { int iter_size; + struct cache *ca = container_of(sb, struct cache, sb); struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); if (!c) @@ -1781,17 +1878,16 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_cache_accounting_init(&c->accounting, &c->cl); - memcpy(c->sb.set_uuid, sb->set_uuid, 16); - c->sb.block_size = sb->block_size; - c->sb.bucket_size = sb->bucket_size; - c->sb.nr_in_set = sb->nr_in_set; - c->sb.last_mount = sb->last_mount; + memcpy(c->set_uuid, sb->set_uuid, 16); + + c->cache = ca; + c->cache->set = c; c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); - c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry); c->devices_max_used = 0; atomic_set(&c->attached_dev_nr, 0); - c->btree_pages = bucket_pages(c); + c->btree_pages = meta_bucket_pages(sb); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); @@ -1817,24 +1913,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = (sb->bucket_size / sb->block_size + 1) * + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * sizeof(struct btree_iter_set); - if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) || - mempool_init_slab_pool(&c->search, 32, bch_search_cache) || - mempool_init_kmalloc_pool(&c->bio_meta, 2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c)) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || - !(c->moving_gc_wq = alloc_workqueue("bcache_gc", - WQ_MEM_RECLAIM, 0)) || - bch_journal_alloc(c) || - bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c) || - bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) + c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); + if (!c->devices) + goto err; + + if (mempool_init_slab_pool(&c->search, 32, bch_search_cache)) + goto err; + + if (mempool_init_kmalloc_pool(&c->bio_meta, 2, + sizeof(struct bbio) + + sizeof(struct bio_vec) * meta_bucket_pages(sb))) + goto err; + + if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) + goto err; + + if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_RESCUER)) + goto err; + + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); + if (!c->uuids) + goto err; + + c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + if (!c->moving_gc_wq) + goto err; + + if (bch_journal_alloc(c)) + goto err; + + if (bch_btree_cache_alloc(c)) + goto err; + + if (bch_open_buckets_alloc(c)) + goto err; + + if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -1853,19 +1971,17 @@ static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; - struct cache *ca; + struct cache *ca = c->cache; struct closure cl; - unsigned int i; LIST_HEAD(journal); struct journal_replay *l; closure_init_stack(&cl); - for_each_cache(ca, c, i) - c->nbuckets += ca->sb.nbuckets; + c->nbuckets = ca->sb.nbuckets; set_gc_sectors(c); - if (CACHE_SYNC(&c->sb)) { + if (CACHE_SYNC(&c->cache->sb)) { struct bkey *k; struct jset *j; @@ -1873,7 +1989,7 @@ static int run_cache_set(struct cache_set *c) if (bch_journal_read(c, &journal)) goto err; - pr_debug("btree_journal_read() done"); + pr_debug("btree_journal_read() done\n"); err = "no journal entries found"; if (list_empty(&journal)) @@ -1882,10 +1998,8 @@ static int run_cache_set(struct cache_set *c) j = &list_entry(journal.prev, struct journal_replay, list)->j; err = "IO error reading priorities"; - for_each_cache(ca, c, i) { - if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) - goto err; - } + if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) + goto err; /* * If prio_read() fails it'll call cache_set_error and we'll @@ -1919,7 +2033,7 @@ static int run_cache_set(struct cache_set *c) bch_journal_mark(c, &journal); bch_initial_gc_finish(c); - pr_debug("btree_check() done"); + pr_debug("btree_check() done\n"); /* * bcache_journal_next() can't happen sooner, or @@ -1929,9 +2043,8 @@ static int run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -1950,28 +2063,23 @@ static int run_cache_set(struct cache_set *c) if (bch_journal_replay(c, &journal)) goto err; } else { - pr_notice("invalidating existing data"); + unsigned int j; - for_each_cache(ca, c, i) { - unsigned int j; + pr_notice("invalidating existing data\n"); + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); - ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, - 2, SB_JOURNAL_BUCKETS); - - for (j = 0; j < ca->sb.keys; j++) - ca->sb.d[j] = ca->sb.first_bucket + j; - } + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; bch_initial_gc_finish(c); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (bch_cache_allocator_start(ca)) - goto err; + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); - for_each_cache(ca, c, i) - bch_prio_write(ca, true); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; @@ -1996,7 +2104,7 @@ static int run_cache_set(struct cache_set *c) * everything is set up - fortunately journal entries won't be * written until the SET_CACHE_SYNC() here: */ - SET_CACHE_SYNC(&c->sb, true); + SET_CACHE_SYNC(&c->cache->sb, true); bch_journal_next(&c->journal); bch_journal_meta(c, &cl); @@ -2007,14 +2115,18 @@ static int run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = (u32)ktime_get_real_seconds(); + c->cache->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) + pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n"); + list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c, NULL); flash_devs_run(c); + bch_journal_space_reserve(&c->journal); set_bit(CACHE_SET_RUNNING, &c->flags); return 0; err: @@ -2031,13 +2143,6 @@ err: return -EIO; } -static bool can_attach_cache(struct cache *ca, struct cache_set *c) -{ - return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.bucket_size && - ca->sb.nr_in_set == c->sb.nr_in_set; -} - static const char *register_cache_set(struct cache *ca) { char buf[12]; @@ -2045,16 +2150,10 @@ static const char *register_cache_set(struct cache *ca) struct cache_set *c; list_for_each_entry(c, &bch_cache_sets, list) - if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { - if (c->cache[ca->sb.nr_this_dev]) + if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache) return "duplicate cache set member"; - if (!can_attach_cache(ca, c)) - return "cache sb does not match set"; - - if (!CACHE_SYNC(&ca->sb)) - SET_CACHE_SYNC(&c->sb, false); - goto found; } @@ -2063,7 +2162,7 @@ static const char *register_cache_set(struct cache *ca) return err; err = "error creating kobject"; - if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) || kobject_add(&c->internal, &c->kobj, "internal")) goto err; @@ -2079,24 +2178,13 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - if (ca->sb.seq > c->sb.seq) { - c->sb.version = ca->sb.version; - memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); - c->sb.flags = ca->sb.flags; - c->sb.seq = ca->sb.seq; - pr_debug("set version = %llu", c->sb.version); - } - kobject_get(&ca->kobj); ca->set = c; - ca->set->cache[ca->sb.nr_this_dev] = ca; - c->cache_by_alloc[c->caches_loaded++] = ca; + ca->set->cache = ca; - if (c->caches_loaded == c->sb.nr_in_set) { - err = "failed to run cache set"; - if (run_cache_set(c) < 0) - goto err; - } + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; return NULL; err: @@ -2113,11 +2201,11 @@ void bch_cache_release(struct kobject *kobj) unsigned int i; if (ca->set) { - BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); - ca->set->cache[ca->sb.nr_this_dev] = NULL; + BUG_ON(ca->set->cache != ca); + ca->set->cache = NULL; } - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -2148,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); /* * when ca->sb.njournal_buckets is not zero, journal exists, @@ -2214,7 +2302,7 @@ static int cache_alloc(struct cache *ca) goto err_prio_buckets_alloc; } - ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca); + ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb); if (!ca->disk_buckets) { err = "ca->disk_buckets alloc failed"; goto err_disk_buckets_alloc; @@ -2246,7 +2334,7 @@ err_btree_alloc: err_free: module_put(THIS_MODULE); if (err) - pr_notice("error %s: %s", ca->cache_dev_name, err); + pr_notice("error %pg: %s\n", ca->bdev, err); return ret; } @@ -2256,13 +2344,12 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, const char *err = NULL; /* must be set for any error case */ int ret = 0; - bdevname(bdev, ca->cache_dev_name); memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; ca->sb_disk = sb_disk; - if (blk_queue_discard(bdev_get_queue(bdev))) + if (bdev_max_discard_sectors((bdev))) ca->discard = CACHE_DISCARD(&ca->sb); ret = cache_alloc(ca); @@ -2283,9 +2370,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; } - if (kobject_add(&ca->kobj, - &part_to_dev(bdev->bd_part)->kobj, - "bcache")) { + if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; @@ -2300,14 +2385,14 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto out; } - pr_info("registered cache device %s", ca->cache_dev_name); + pr_info("registered cache device %pg\n", ca->bdev); out: kobject_put(&ca->kobj); err: if (err) - pr_notice("error %s: %s", ca->cache_dev_name, err); + pr_notice("error %pg: %s\n", ca->bdev, err); return ret; } @@ -2324,37 +2409,116 @@ kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); -static bool bch_is_open_backing(struct block_device *bdev) +static bool bch_is_open_backing(dev_t dev) { struct cache_set *c, *tc; struct cached_dev *dc, *t; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; return false; } -static bool bch_is_open_cache(struct block_device *bdev) +static bool bch_is_open_cache(dev_t dev) { struct cache_set *c, *tc; - struct cache *ca; - unsigned int i; - list_for_each_entry_safe(c, tc, &bch_cache_sets, list) - for_each_cache(ca, c, i) - if (ca->bdev == bdev) - return true; + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + struct cache *ca = c->cache; + + if (ca->bdev->bd_dev == dev) + return true; + } + return false; } -static bool bch_is_open(struct block_device *bdev) +static bool bch_is_open(dev_t dev) +{ + return bch_is_open_cache(dev) || bch_is_open_backing(dev); +} + +struct async_reg_args { + struct delayed_work reg_work; + char *path; + struct cache_sb *sb; + struct cache_sb_disk *sb_disk; + struct block_device *bdev; +}; + +static void register_bdev_worker(struct work_struct *work) { - return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); + int fail = false; + struct async_reg_args *args = + container_of(work, struct async_reg_args, reg_work.work); + struct cached_dev *dc; + + dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) { + fail = true; + put_page(virt_to_page(args->sb_disk)); + blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + goto out; + } + + mutex_lock(&bch_register_lock); + if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) + fail = true; + mutex_unlock(&bch_register_lock); + +out: + if (fail) + pr_info("error %s: fail to register backing device\n", + args->path); + kfree(args->sb); + kfree(args->path); + kfree(args); + module_put(THIS_MODULE); +} + +static void register_cache_worker(struct work_struct *work) +{ + int fail = false; + struct async_reg_args *args = + container_of(work, struct async_reg_args, reg_work.work); + struct cache *ca; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) { + fail = true; + put_page(virt_to_page(args->sb_disk)); + blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + goto out; + } + + /* blkdev_put() will be called in bch_cache_release() */ + if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) + fail = true; + +out: + if (fail) + pr_info("error %s: fail to register cache device\n", + args->path); + kfree(args->sb); + kfree(args->path); + kfree(args); + module_put(THIS_MODULE); +} + +static void register_device_async(struct async_reg_args *args) +{ + if (SB_IS_BDEV(args->sb)) + INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker); + else + INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); + + /* 10 jiffies is enough for a delay */ + queue_delayed_work(system_wq, &args->reg_work, 10); } static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, @@ -2366,6 +2530,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, struct cache_sb_disk *sb_disk; struct block_device *bdev; ssize_t ret; + bool async_registration = false; + +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION + async_registration = true; +#endif ret = -EBUSY; err = "failed to reference bcache module"; @@ -2395,15 +2564,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, sb); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(strim(path)); + dev_t dev; + mutex_lock(&bch_register_lock); - if (!IS_ERR(bdev) && bch_is_open(bdev)) + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) err = "device already registered"; else err = "device busy"; mutex_unlock(&bch_register_lock); - if (!IS_ERR(bdev)) - bdput(bdev); if (attr == &ksysfs_register_quiet) goto done; } @@ -2419,11 +2588,35 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_blkdev_put; err = "failed to register device"; + + if (async_registration) { + /* register in asynchronous way */ + struct async_reg_args *args = + kzalloc(sizeof(struct async_reg_args), GFP_KERNEL); + + if (!args) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + + args->path = path; + args->sb = sb; + args->sb_disk = sb_disk; + args->bdev = bdev; + register_device_async(args); + /* No wait and returns to user space */ + goto async_done; + } + if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) + if (!dc) { + ret = -ENOMEM; + err = "cannot allocate memory"; goto out_put_sb_page; + } mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_disk, bdev, dc); @@ -2434,11 +2627,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) + if (!ca) { + ret = -ENOMEM; + err = "cannot allocate memory"; goto out_put_sb_page; + } /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_disk, bdev, ca) != 0) + ret = register_cache(sb, sb_disk, bdev, ca); + if (ret) goto out_free_sb; } @@ -2446,6 +2643,7 @@ done: kfree(sb); kfree(path); module_put(THIS_MODULE); +async_done: return size; out_put_sb_page: @@ -2460,7 +2658,7 @@ out_free_path: out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s", path?path:"", err); + pr_info("error %s: %s\n", path?path:"", err); return ret; } @@ -2491,9 +2689,9 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, } list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { - char *pdev_set_uuid = pdev->dc->sb.set_uuid; - char *set_uuid = c->sb.uuid; + char *set_uuid = c->set_uuid; if (!memcmp(pdev_set_uuid, set_uuid, 16)) { list_del(&pdev->list); @@ -2505,7 +2703,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, mutex_unlock(&bch_register_lock); list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { - pr_info("delete pdev %p", pdev); + pr_info("delete pdev %p\n", pdev); list_del(&pdev->list); bcache_device_stop(&pdev->dc->disk); kfree(pdev); @@ -2548,13 +2746,13 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_unlock(&bch_register_lock); - pr_info("Stopping all devices:"); + pr_info("Stopping all devices:\n"); /* * The reason bch_register_lock is not held to call * bch_cache_set_stop() and bcache_device_stop() is to * avoid potential deadlock during reboot, because cache - * set or bcache device stopping process will acqurie + * set or bcache device stopping process will acquire * bch_register_lock too. * * We are safe here because bcache_is_reboot sets to @@ -2598,9 +2796,9 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) finish_wait(&unregister_wait, &wait); if (stopped) - pr_info("All devices stopped"); + pr_info("All devices stopped\n"); else - pr_notice("Timeout waiting for devices to be closed"); + pr_notice("Timeout waiting for devices to be closed\n"); out: mutex_unlock(&bch_register_lock); } @@ -2623,6 +2821,9 @@ static void bcache_exit(void) destroy_workqueue(bcache_wq); if (bch_journal_wq) destroy_workqueue(bch_journal_wq); + if (bch_flush_wq) + destroy_workqueue(bch_flush_wq); + bch_btree_exit(); if (bcache_major) unregister_blkdev(bcache_major, "bcache"); @@ -2636,7 +2837,7 @@ static void check_module_parameters(void) if (bch_cutoff_writeback_sync == 0) bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { - pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u", + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n", bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; } @@ -2644,13 +2845,13 @@ static void check_module_parameters(void) if (bch_cutoff_writeback == 0) bch_cutoff_writeback = CUTOFF_WRITEBACK; else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { - pr_warn("set bch_cutoff_writeback (%u) to max value %u", + pr_warn("set bch_cutoff_writeback (%u) to max value %u\n", bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; } if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { - pr_warn("set bch_cutoff_writeback (%u) to %u", + pr_warn("set bch_cutoff_writeback (%u) to %u\n", bch_cutoff_writeback, bch_cutoff_writeback_sync); bch_cutoff_writeback = bch_cutoff_writeback_sync; } @@ -2678,10 +2879,26 @@ static int __init bcache_init(void) return bcache_major; } + if (bch_btree_init()) + goto err; + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); if (!bcache_wq) goto err; + /* + * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: + * + * 1. It used `system_wq` before which also does no memory reclaim. + * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and + * reduced throughput can be observed. + * + * We still want to user our own queue to not congest the `system_wq`. + */ + bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + if (!bch_flush_wq) + goto err; + bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); if (!bch_journal_wq) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3470fae4eabc..c6f677059214 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -11,6 +11,7 @@ #include "btree.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> #include <linux/sort.h> @@ -88,6 +89,9 @@ read_attribute(btree_used_percent); read_attribute(average_key_size); read_attribute(dirty_data); read_attribute(bset_tree_stats); +read_attribute(feature_compat); +read_attribute(feature_ro_compat); +read_attribute(feature_incompat); read_attribute(state); read_attribute(cache_read_races); @@ -113,10 +117,14 @@ rw_attribute(writeback_running); rw_attribute(writeback_percent); rw_attribute(writeback_delay); rw_attribute(writeback_rate); +rw_attribute(writeback_consider_fragment); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_fp_term_low); +rw_attribute(writeback_rate_fp_term_mid); +rw_attribute(writeback_rate_fp_term_high); rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); @@ -129,7 +137,6 @@ rw_attribute(io_disable); rw_attribute(discard); rw_attribute(running); rw_attribute(label); -rw_attribute(readahead); rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); @@ -154,7 +161,7 @@ static ssize_t bch_snprint_string_list(char *buf, size_t i; for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, + out += scnprintf(out, buf + size - out, i == selected ? "[%s] " : "%s ", list[i]); out[-1] = '\n'; @@ -191,6 +198,7 @@ SHOW(__bch_cached_dev) var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); + var_printf(writeback_consider_fragment, "%i"); var_print(writeback_delay); var_print(writeback_percent); sysfs_hprint(writeback_rate, @@ -201,6 +209,9 @@ SHOW(__bch_cached_dev) var_print(writeback_rate_update_seconds); var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_fp_term_low); + var_print(writeback_rate_fp_term_mid); + var_print(writeback_rate_fp_term_high); var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { @@ -248,7 +259,6 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u"); var_hprint(sequential_cutoff); - var_hprint(readahead); sysfs_print(running, atomic_read(&dc->running)); sysfs_print(state, states[BDEV_STATE(&dc->sb)]); @@ -261,7 +271,7 @@ SHOW(__bch_cached_dev) } if (attr == &sysfs_backing_dev_name) { - snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name); + snprintf(buf, BDEVNAME_SIZE + 1, "%pg", dc->bdev); strcat(buf, "\n"); return strlen(buf); } @@ -299,6 +309,7 @@ STORE(__cached_dev) sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); sysfs_strtoul_bool(writeback_running, dc->writeback_running); + sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment); sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, @@ -327,6 +338,16 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_p_term_inverse, dc->writeback_rate_p_term_inverse, 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_fp_term_low, + dc->writeback_rate_fp_term_low, + 1, dc->writeback_rate_fp_term_mid - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_low + 1, + dc->writeback_rate_fp_term_high - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_mid + 1, UINT_MAX); sysfs_strtoul_clamp(writeback_rate_minimum, dc->writeback_rate_minimum, 1, UINT_MAX); @@ -342,7 +363,6 @@ STORE(__cached_dev) sysfs_strtoul_clamp(sequential_cutoff, dc->sequential_cutoff, 0, UINT_MAX); - d_strtoi_h(readahead); if (attr == &sysfs_clear_stats) bch_cache_accounting_clear(&dc->accounting); @@ -400,7 +420,7 @@ STORE(__cached_dev) if (!env) return -ENOMEM; add_uevent_var(env, "DRIVER=bcache"); - add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid); add_uevent_var(env, "CACHED_LABEL=%s", buf); kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, @@ -421,7 +441,7 @@ STORE(__cached_dev) return size; } if (v == -ENOENT) - pr_err("Can't attach %s: cache set not found", buf); + pr_err("Can't attach %s: cache set not found\n", buf); return v; } @@ -455,7 +475,7 @@ STORE(bch_cached_dev) */ if (dc->writeback_running) { dc->writeback_running = false; - pr_err("%s: failed to run non-existent writeback thread", + pr_err("%s: failed to run non-existent writeback thread\n", dc->disk.disk->disk_name); } } else @@ -480,7 +500,7 @@ STORE(bch_cached_dev) return size; } -static struct attribute *bch_cached_dev_files[] = { +static struct attribute *bch_cached_dev_attrs[] = { &sysfs_attach, &sysfs_detach, &sysfs_stop, @@ -495,9 +515,13 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_delay, &sysfs_writeback_percent, &sysfs_writeback_rate, + &sysfs_writeback_consider_fragment, &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_fp_term_low, + &sysfs_writeback_rate_fp_term_mid, + &sysfs_writeback_rate_fp_term_high, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, &sysfs_io_errors, @@ -511,7 +535,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_running, &sysfs_state, &sysfs_label, - &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_bypass_torture_test, @@ -520,6 +543,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_backing_dev_uuid, NULL }; +ATTRIBUTE_GROUPS(bch_cached_dev); KTYPE(bch_cached_dev); SHOW(bch_flash_dev) @@ -577,7 +601,7 @@ STORE(__bch_flash_dev) } STORE_LOCKED(bch_flash_dev) -static struct attribute *bch_flash_dev_files[] = { +static struct attribute *bch_flash_dev_attrs[] = { &sysfs_unregister, #if 0 &sysfs_data_csum, @@ -586,6 +610,7 @@ static struct attribute *bch_flash_dev_files[] = { &sysfs_size, NULL }; +ATTRIBUTE_GROUPS(bch_flash_dev); KTYPE(bch_flash_dev); struct bset_stats_op { @@ -707,10 +732,10 @@ SHOW(__bch_cache_set) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb)); sysfs_print(journal_delay_ms, c->journal_delay_ms); - sysfs_hprint(bucket_size, bucket_bytes(c)); - sysfs_hprint(block_size, block_bytes(c)); + sysfs_hprint(bucket_size, bucket_bytes(c->cache)); + sysfs_hprint(block_size, block_bytes(c->cache)); sysfs_print(tree_depth, c->root->level); sysfs_print(root_usage_percent, bch_root_usage(c)); @@ -779,6 +804,13 @@ SHOW(__bch_cache_set) if (attr == &sysfs_bset_tree_stats) return bch_bset_print_stats(c, buf); + if (attr == &sysfs_feature_compat) + return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_ro_compat) + return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_incompat) + return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE); + return 0; } SHOW_LOCKED(bch_cache_set) @@ -801,8 +833,8 @@ STORE(__bch_cache_set) if (attr == &sysfs_synchronous) { bool sync = strtoul_or_return(buf); - if (sync != CACHE_SYNC(&c->sb)) { - SET_CACHE_SYNC(&c->sb, sync); + if (sync != CACHE_SYNC(&c->cache->sb)) { + SET_CACHE_SYNC(&c->cache->sb, sync); bcache_write_super(c); } } @@ -872,11 +904,11 @@ STORE(__bch_cache_set) if (v) { if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_warn("CACHE_SET_IO_DISABLE already set"); + pr_warn("CACHE_SET_IO_DISABLE already set\n"); } else { if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_warn("CACHE_SET_IO_DISABLE already cleared"); + pr_warn("CACHE_SET_IO_DISABLE already cleared\n"); } } @@ -925,7 +957,7 @@ static void bch_cache_set_internal_release(struct kobject *k) { } -static struct attribute *bch_cache_set_files[] = { +static struct attribute *bch_cache_set_attrs[] = { &sysfs_unregister, &sysfs_stop, &sysfs_synchronous, @@ -950,9 +982,10 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_clear_stats, NULL }; +ATTRIBUTE_GROUPS(bch_cache_set); KTYPE(bch_cache_set); -static struct attribute *bch_cache_set_internal_files[] = { +static struct attribute *bch_cache_set_internal_attrs[] = { &sysfs_active_journal_entries, sysfs_time_stats_attribute_list(btree_gc, sec, ms) @@ -987,8 +1020,12 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_io_disable, &sysfs_cutoff_writeback, &sysfs_cutoff_writeback_sync, + &sysfs_feature_compat, + &sysfs_feature_ro_compat, + &sysfs_feature_incompat, NULL }; +ATTRIBUTE_GROUPS(bch_cache_set_internal); KTYPE(bch_cache_set_internal); static int __bch_cache_cmp(const void *l, const void *r) @@ -1057,8 +1094,10 @@ SHOW(__bch_cache) --n; while (cached < p + n && - *cached == BTREE_PRIO) - cached++, n--; + *cached == BTREE_PRIO) { + cached++; + n--; + } for (i = 0; i < n; i++) sum += INITIAL_PRIO - cached[i]; @@ -1112,7 +1151,7 @@ STORE(__bch_cache) if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - if (blk_queue_discard(bdev_get_queue(ca->bdev))) + if (bdev_max_discard_sectors(ca->bdev)) ca->discard = v; if (v != CACHE_DISCARD(&ca->sb)) { @@ -1147,7 +1186,7 @@ STORE(__bch_cache) } STORE_LOCKED(bch_cache) -static struct attribute *bch_cache_files[] = { +static struct attribute *bch_cache_attrs[] = { &sysfs_bucket_size, &sysfs_block_size, &sysfs_nbuckets, @@ -1161,4 +1200,5 @@ static struct attribute *bch_cache_files[] = { &sysfs_cache_replacement_policy, NULL }; +ATTRIBUTE_GROUPS(bch_cache); KTYPE(bch_cache); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 215df32f567b..a2ff6447b699 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -9,7 +9,7 @@ struct kobj_type type ## _ktype = { \ .show = type ## _show, \ .store = type ## _store \ }), \ - .default_attrs = type ## _files \ + .default_groups = type ## _groups \ } #define SHOW(fn) \ @@ -51,13 +51,27 @@ STORE(fn) \ #define sysfs_printf(file, fmt, ...) \ do { \ if (attr == &sysfs_ ## file) \ - return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ + return sysfs_emit(buf, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ do { \ if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ + return sysfs_emit(buf, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned int) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? "%s\n" : "%i\n", var); \ } while (0) #define sysfs_hprint(file, val) \ diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 62fb917f7a4f..ae380bc3992e 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -33,27 +33,27 @@ int bch_ ## name ## _h(const char *cp, type *res) \ case 'y': \ case 'z': \ u++; \ - /* fall through */ \ + fallthrough; \ case 'e': \ u++; \ - /* fall through */ \ + fallthrough; \ case 'p': \ u++; \ - /* fall through */ \ + fallthrough; \ case 't': \ u++; \ - /* fall through */ \ + fallthrough; \ case 'g': \ u++; \ - /* fall through */ \ + fallthrough; \ case 'm': \ u++; \ - /* fall through */ \ + fallthrough; \ case 'k': \ u++; \ if (e++ == cp) \ return -EINVAL; \ - /* fall through */ \ + fallthrough; \ case '\n': \ case '\0': \ if (*e == '\n') \ diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index c029f7443190..6f3cb7c92130 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ #include "closure.h" -#define PAGE_SECTORS (PAGE_SIZE / 512) - struct closure; #ifdef CONFIG_BCACHE_DEBUG @@ -27,7 +25,7 @@ struct closure; #else /* DEBUG */ -#define EBUG_ON(cond) do { if (cond); } while (0) +#define EBUG_ON(cond) do { if (cond) do {} while (0); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) @@ -342,23 +340,6 @@ static inline int bch_strtoul_h(const char *cp, long *res) _r; \ }) -#define snprint(buf, size, var) \ - snprintf(buf, size, \ - __builtin_types_compatible_p(typeof(var), int) \ - ? "%i\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned int) \ - ? "%u\n" : \ - __builtin_types_compatible_p(typeof(var), long) \ - ? "%li\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned long)\ - ? "%lu\n" : \ - __builtin_types_compatible_p(typeof(var), int64_t) \ - ? "%lli\n" : \ - __builtin_types_compatible_p(typeof(var), uint64_t) \ - ? "%llu\n" : \ - __builtin_types_compatible_p(typeof(var), const char *) \ - ? "%s\n" : "%i\n", var) - ssize_t bch_hprint(char *buf, int64_t v); bool bch_is_zero(const char *p, size_t n); @@ -550,14 +531,6 @@ static inline uint64_t bch_crc64(const void *p, size_t len) return crc ^ 0xffffffffffffffffULL; } -static inline uint64_t bch_crc64_update(uint64_t crc, - const void *p, - size_t len) -{ - crc = crc64_be(crc, p, len); - return crc; -} - /* * A stepwise-linear pseudo-exponential. This returns 1 << (x >> * frac_bits), with the less-significant bits filled in by linear @@ -586,8 +559,4 @@ static inline unsigned int fract_exp_two(unsigned int x, void bch_bio_map(struct bio *bio, void *base); int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} #endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4a40f9eadeaf..0285b676e983 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * This is the size of the cache, minus the amount used for * flash-only devices */ - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - atomic_long_read(&c->flash_dev_dirty_sectors); /* @@ -45,7 +45,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * backing volume uses about 2% of the cache for dirty data. */ uint32_t bdev_share = - div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, c->cached_dev_sectors); uint64_t cache_dirty_target = @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t integral_scaled; uint32_t new_rate; + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. + */ + struct cache_set *c = dc->disk.c; + + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = (int64_t)dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = (int64_t)dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } + if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), dc->writeback_rate.next + NSEC_PER_MSEC))) { @@ -119,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_target = target; } +static bool idle_counter_exceeded(struct cache_set *c) +{ + int counter, dev_nr; + + /* + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. + */ + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } + + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { @@ -129,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c, /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; - /* - * Idle_counter is increased everytime when update_writeback_rate() is - * called. If all backing devices attached to the same cache set have - * identical dc->writeback_rate_update_seconds values, it is about 6 - * rounds of update_writeback_rate() on each backing device before - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set - * to each dc->writeback_rate.rate. - * In order to avoid extra locking cost for counting exact dirty cached - * devices number, c->attached_dev_nr is used to calculate the idle - * throushold. It might be bigger if not all cached device are in write- - * back mode, but it still works well with limited extra rounds of - * update_writeback_rate(). - */ - if (atomic_inc_return(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) + + if (!idle_counter_exceeded(c)) return false; if (atomic_read(&c->at_max_writeback_rate) != 1) @@ -157,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c, dc->writeback_rate_change = 0; /* - * Check c->idle_counter and c->at_max_writeback_rate agagain in case - * new I/O arrives during before set_at_max_writeback_rate() returns. - * Then the writeback rate is set to 1, and its new value should be - * decided via __update_writeback_rate(). + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. */ - if ((atomic_read(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) || + if (!idle_counter_exceeded(c) || !atomic_read(&c->at_max_writeback_rate)) return false; @@ -183,7 +252,7 @@ static void update_writeback_rate(struct work_struct *work) */ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -193,23 +262,31 @@ static void update_writeback_rate(struct work_struct *work) test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); return; } - if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { - /* - * If the whole cache set is idle, set_at_max_writeback_rate() - * will set writeback rate to a max number. Then it is - * unncessary to update writeback rate for an idle cache set - * in maximum writeback rate number(s). - */ - if (!set_at_max_writeback_rate(c, dc)) { - down_read(&dc->writeback_lock); + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent && + !set_at_max_writeback_rate(c, dc)) { + do { + if (!down_read_trylock((&dc->writeback_lock))) { + dc->rate_update_retry++; + if (dc->rate_update_retry <= + BCH_WBRATE_UPDATE_MAX_SKIPS) + break; + down_read(&dc->writeback_lock); + dc->rate_update_retry = 0; + } __update_writeback_rate(dc); update_gc_after_writeback(c); up_read(&dc->writeback_lock); - } + } while (0); } @@ -229,7 +306,7 @@ static void update_writeback_rate(struct work_struct *work) */ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); } static unsigned int writeback_delay(struct cached_dev *dc, @@ -254,8 +331,8 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, bio->bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)); + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); @@ -378,7 +455,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + bch_count_io_errors(io->dc->disk.c->cache, bio->bi_status, 1, "reading dirty data from cache"); @@ -459,10 +536,8 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(sizeof(struct dirty_io) + - sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), - PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -474,8 +549,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, - PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) @@ -523,15 +597,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned int stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, sectors_dirty; + int stripe; if (!d) return; + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -546,10 +624,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, sectors_dirty = atomic_add_return(s, d->stripe_sectors_dirty + stripe); - if (sectors_dirty == d->stripe_size) - set_bit(stripe, d->full_dirty_stripes); - else - clear_bit(stripe, d->full_dirty_stripes); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } nr_sectors -= s; stripe_offset = 0; @@ -571,12 +652,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned int start_stripe, stripe, next_stripe; + unsigned int start_stripe, next_stripe; + int stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - - if (stripe >= dc->disk.nr_stripes) + if (stripe < 0) stripe = 0; start_stripe = stripe; @@ -703,6 +784,15 @@ static int bch_writeback_thread(void *arg) * bch_cached_dev_detach(). */ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + up_write(&dc->writeback_lock); break; } @@ -742,10 +832,9 @@ static int bch_writeback_thread(void *arg) } } - if (dc->writeback_write_wq) { - flush_workqueue(dc->writeback_write_wq); + if (dc->writeback_write_wq) destroy_workqueue(dc->writeback_write_wq); - } + cached_dev_put(dc); wait_for_kthread_stop(); @@ -754,13 +843,11 @@ static int bch_writeback_thread(void *arg) /* Init */ #define INIT_KEYS_EACH_TIME 500000 -#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; unsigned int inode; size_t count; - struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -776,16 +863,15 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, KEY_START(k), KEY_SIZE(k)); op->count++; - if (atomic_read(&b->c->search_inflight) && - !(op->count % INIT_KEYS_EACH_TIME)) { - bkey_copy_key(&op->start, k); - return -EAGAIN; - } + if (!(op->count % INIT_KEYS_EACH_TIME)) + cond_resched(); return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct bcache_device *d) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { struct sectors_dirty_init op; int ret; @@ -793,19 +879,148 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_op_init(&op.op, -1); op.inode = d->id; op.count = 0; - op.start = KEY(op.inode, 0, 0); - - do { - ret = bch_btree_map_keys(&op.op, d->c, &op.start, - sectors_dirty_init_fn, 0); - if (ret == -EAGAIN) - schedule_timeout_interruptible( - msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); - else if (ret < 0) { - pr_warn("sectors dirty init failed, ret=%d!", ret); + + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, + 0); + if (ret < 0) + pr_warn("sectors dirty init failed, ret=%d!\n", ret); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + cur_idx = prev_idx = 0; + + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + } + +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state state; + + /* Just count root keys if no leaf node */ + rw_lock(0, c->root, c->root->level); + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); + + rw_unlock(0, c->root); + return; + } + + memset(&state, 0, sizeof(struct bch_dirty_init_state)); + state.c = c; + state.d = d; + state.total_threads = bch_btre_dirty_init_thread_nr(); + state.key_idx = 0; + spin_lock_init(&state.idx_lock); + atomic_set(&state.started, 0); + atomic_set(&state.enough, 0); + init_waitqueue_head(&state.wait); + + for (i = 0; i < state.total_threads; i++) { + /* Fetch latest state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state.enough)) break; + + state.infos[i].state = &state; + state.infos[i].thread = + kthread_run(bch_dirty_init_thread, &state.infos[i], + "bch_dirtcnt[%d]", i); + if (IS_ERR(state.infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]\n", i); + for (--i; i >= 0; i--) + kthread_stop(state.infos[i].thread); + goto out; } - } while (ret == -EAGAIN); + atomic_inc(&state.started); + } + +out: + /* Must wait for all threads to stop. */ + wait_event(state.wait, atomic_read(&state.started) == 0); + rw_unlock(0, c->root); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -816,6 +1031,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_metadata = true; dc->writeback_running = false; + dc->writeback_consider_fragment = true; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -823,8 +1039,14 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; dc->writeback_rate_i_term_inverse = 10000; + /* For dc->writeback_lock contention in update_writeback_rate() */ + dc->rate_update_retry = 0; + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 4e4c6810dc3c..31df716951f6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,11 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 + +#define BCH_DIRTY_INIT_THRD_MAX 12 /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -23,6 +28,24 @@ */ #define WRITEBACK_SHARE_SHIFT 14 +struct bch_dirty_init_state; +struct dirty_init_thrd_info { + struct bch_dirty_init_state *state; + struct task_struct *thread; +}; + +struct bch_dirty_init_state { + struct cache_set *c; + struct bcache_device *d; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; +}; + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -33,10 +56,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline unsigned int offset_to_stripe(struct bcache_device *d, +static inline int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); + + /* d->nr_stripes is in range [1, INT_MAX] */ + if (unlikely(offset >= d->nr_stripes)) { + pr_err("Invalid stripe %llu (>= nr_stripes %d).\n", + offset, d->nr_stripes); + return -EINVAL; + } + + /* + * Here offset is definitly smaller than INT_MAX, + * return it as int will never overflow. + */ return offset; } @@ -44,7 +79,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned int nr_sectors) { - unsigned int stripe = offset_to_stripe(&dc->disk, offset); + int stripe = offset_to_stripe(&dc->disk, offset); + + if (stripe < 0) + return false; while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) |