From 34cf78bf34d48dddddfeeadb44f9841d7864997a Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Wed, 13 Nov 2019 16:03:16 +0800 Subject: bcache: fix a lost wake-up problem caused by mca_cannibalize_lock This patch fix a lost wake-up problem caused by the race between mca_cannibalize_lock and bch_cannibalize_unlock. Consider two processes, A and B. Process A is executing mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock and is executing bch_cannibalize_unlock. The problem happens that after process A executes cmpxchg and will execute prepare_to_wait. In this timeslice process B executes wake_up, but after that process A executes prepare_to_wait and set the state to TASK_INTERRUPTIBLE. Then process A goes to sleep but no one will wake up it. This problem may cause bcache device to dead. Signed-off-by: Guoju Fang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..ebb854ed05a4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); -- cgit v1.2.3-59-g8ed1b From 2d8869518a525c9bce5f5268419df9dfbe3dfdeb Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:17 +0800 Subject: bcache: fix static checker warning in bcache_device_free() Commit cafe56359144 ("bcache: A block layer cache") leads to the following static checker warning: ./drivers/md/bcache/super.c:770 bcache_device_free() warn: variable dereferenced before check 'd->disk' (see line 766) drivers/md/bcache/super.c 762 static void bcache_device_free(struct bcache_device *d) 763 { 764 lockdep_assert_held(&bch_register_lock); 765 766 pr_info("%s stopped", d->disk->disk_name); ^^^^^^^^^ Unchecked dereference. 767 768 if (d->c) 769 bcache_device_detach(d); 770 if (d->disk && d->disk->flags & GENHD_FL_UP) ^^^^^^^ Check too late. 771 del_gendisk(d->disk); 772 if (d->disk && d->disk->queue) 773 blk_cleanup_queue(d->disk->queue); 774 if (d->disk) { 775 ida_simple_remove(&bcache_device_idx, 776 first_minor_to_idx(d->disk->first_minor)); 777 put_disk(d->disk); 778 } 779 It is not 100% sure that the gendisk struct of bcache device will always be there, the warning makes sense when there is problem in block core. This patch tries to remove the static checking warning by checking d->disk to avoid NULL pointer deferences. Reported-by: Dan Carpenter Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ebb854ed05a4..7beccede5360 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); } bioset_exit(&d->bio_split); -- cgit v1.2.3-59-g8ed1b From aaf8dbeab5865720c66db60ae8329309e81a0c9c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:18 +0800 Subject: bcache: add more accurate error messages in read_super() Previous code only returns "Not a bcache superblock" for both bcache super block offset and magic error. This patch addss more accurate error messages, - for super block unmatched offset: "Not a bcache superblock (bad offset)" - for super block unmatched magic number: "Not a bcache superblock (bad magic)" Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7beccede5360..623fdaf10c4c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys); - err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err; + err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err; -- cgit v1.2.3-59-g8ed1b From 84c529aea182939e68f618ed9813740c9165c7eb Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 13 Nov 2019 16:03:21 +0800 Subject: bcache: fix deadlock in bcache_allocator bcache_allocator can call the following: bch_allocator_thread() -> bch_prio_write() -> bch_bucket_alloc() -> wait on &ca->set->bucket_wait But the wake up event on bucket_wait is supposed to come from bch_allocator_thread() itself => deadlock: [ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds. [ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232 [ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1158.504413] bcache_allocato D 0 15861 2 0x80004000 [ 1158.504419] Call Trace: [ 1158.504429] __schedule+0x2a8/0x670 [ 1158.504432] schedule+0x2d/0x90 [ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache] [ 1158.504453] ? wait_woken+0x80/0x80 [ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache] [ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache] [ 1158.504491] kthread+0x121/0x140 [ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache] [ 1158.504506] ? kthread_park+0xb0/0xb0 [ 1158.504510] ret_from_fork+0x35/0x40 Fix by making the call to bch_prio_write() non-blocking, so that bch_allocator_thread() never waits on itself. Moreover, make sure to wake up the garbage collector thread when bch_prio_write() is failing to allocate buckets. BugLink: https://bugs.launchpad.net/bugs/1784665 BugLink: https://bugs.launchpad.net/bugs/1796292 Signed-off-by: Andrea Righi Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 5 ++++- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 27 +++++++++++++++++++++------ 3 files changed, 26 insertions(+), 8 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3653faf3bf48..50241e045c70 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 623fdaf10c4c..d1352fcc6ff2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } static void prio_read(struct cache *ca, uint64_t bucket) @@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; -- cgit v1.2.3-59-g8ed1b From c5fcdedcee4e6ae15c0eb5e0fbe25467e57d2963 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:23 +0800 Subject: bcache: add idle_max_writeback_rate sysfs interface For writeback mode, if there is no regular I/O request for a while, the writeback rate will be set to the maximum value (1TB/s for now). This is good for most of the storage workload, but there are still people don't what the maximum writeback rate in I/O idle time. This patch adds a sysfs interface file idle_max_writeback_rate to permit people to disable maximum writeback rate. Then the minimum writeback rate can be advised by writeback_rate_minimum in the bcache device's sysfs interface. Reported-by: Christian Balzer Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 7 +++++++ drivers/md/bcache/writeback.c | 4 ++++ 4 files changed, 13 insertions(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 50241e045c70..9198c1b480d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -724,6 +724,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d1352fcc6ff2..77e9869345e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 627dcea0f5b6..733e2ddf3c78 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); @@ -747,6 +748,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +867,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; -- cgit v1.2.3-59-g8ed1b