aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2016-01-10 07:53:55 -0800
committerDan Williams <dan.j.williams@intel.com>2016-01-10 07:53:55 -0800
commit8b63b6bfc1a551acf154061699028c7032d7890c (patch)
tree16882e9bc9e35eacb870a6d8a71617e579c4ffdc /drivers/md
parentlibnvdimm: fix namespace object confusion in is_uuid_busy() (diff)
parentblock: kill disk_{check|set|clear|alloc}_badblocks (diff)
downloadlinux-dev-8b63b6bfc1a551acf154061699028c7032d7890c.tar.xz
linux-dev-8b63b6bfc1a551acf154061699028c7032d7890c.zip
Merge branch 'for-4.5/block-dax' into for-4.5/libnvdimm
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-thin-metadata.c34
-rw-r--r--drivers/md/md.c516
-rw-r--r--drivers/md/md.h40
-rw-r--r--drivers/md/persistent-data/dm-btree.c101
-rw-r--r--drivers/md/persistent-data/dm-btree.h14
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c32
6 files changed, 189 insertions, 548 deletions
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 1fa45695b68a..c219a053c7f6 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1207,6 +1207,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
dm_block_t held_root;
/*
+ * We commit to ensure the btree roots which we increment in a
+ * moment are up to date.
+ */
+ __commit_transaction(pmd);
+
+ /*
* Copy the superblock.
*/
dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
@@ -1538,7 +1544,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
int r;
- unsigned count;
+ unsigned count, total_count = 0;
struct dm_pool_metadata *pmd = td->pmd;
dm_block_t keys[1] = { td->id };
__le64 value;
@@ -1561,11 +1567,29 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
if (r)
return r;
- r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
- if (r)
- return r;
+ /*
+ * Remove leaves stops at the first unmapped entry, so we have to
+ * loop round finding mapped ranges.
+ */
+ while (begin < end) {
+ r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
+ if (r == -ENODATA)
+ break;
+
+ if (r)
+ return r;
+
+ if (begin >= end)
+ break;
+
+ r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+ if (r)
+ return r;
+
+ total_count += count;
+ }
- td->mapped_blocks -= count;
+ td->mapped_blocks -= total_count;
td->changed = 1;
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 807095f4c793..96a991821ae6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
#include <linux/kthread.h>
#include <linux/blkdev.h>
+#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
@@ -709,8 +710,7 @@ void md_rdev_clear(struct md_rdev *rdev)
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
- kfree(rdev->badblocks.page);
- rdev->badblocks.page = NULL;
+ badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
@@ -1360,8 +1360,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
return cpu_to_le32(csum);
}
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
@@ -1486,8 +1484,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (md_set_badblocks(&rdev->badblocks,
- sector, count, 1) == 0)
+ if (badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -2319,7 +2316,7 @@ repeat:
rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
rdev->badblocks.changed = 0;
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
clear_bit(Blocked, &rdev->flags);
@@ -2445,7 +2442,7 @@ repeat:
clear_bit(Blocked, &rdev->flags);
if (any_badblocks_changed)
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
@@ -3046,11 +3043,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack);
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
-
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ * are recorded as bad. The list is truncated to fit within
+ * the one-page limit of sysfs.
+ * Writing "sector length" to this file adds an acknowledged
+ * bad block list.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ * been acknowledged. Writing to this file adds bad blocks
+ * without acknowledging them. This is largely for testing.
+ */
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
return badblocks_show(&rdev->badblocks, page, 0);
@@ -3165,14 +3168,7 @@ int md_rdev_init(struct md_rdev *rdev)
* This reserves the space even on arrays where it cannot
* be used - I wonder if that matters
*/
- rdev->badblocks.count = 0;
- rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
- rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- seqlock_init(&rdev->badblocks.lock);
- if (rdev->badblocks.page == NULL)
- return -ENOMEM;
-
- return 0;
+ return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
@@ -8478,254 +8474,9 @@ void md_finish_reshape(struct mddev *mddev)
}
EXPORT_SYMBOL(md_finish_reshape);
-/* Bad block management.
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
- *
- * Locking of the bad-block table uses a seqlock so md_is_badblock
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
- *
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
- * We return
- * 0 if there are no known bad blocks in the range
- * 1 if there are known bad block which are all acknowledged
- * -1 if there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
- */
-int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
-{
- int hi;
- int lo;
- u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
-
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
- sectors = target - s;
- }
- /* 'target' is now the first block after the bad range */
-
-retry:
- seq = read_seqbegin(&bb->lock);
- lo = 0;
- rv = 0;
- hi = bb->count;
-
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not. */
- lo = mid;
- else
- /* This and later ranges are definitely out. */
- hi = mid;
- }
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
- */
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
- }
- }
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return rv;
-}
-EXPORT_SYMBOL_GPL(md_is_badblock);
-
-/*
- * Add a range of bad blocks to the table.
- * This might extend the table, or might contract it
- * if two adjacent ranges can be merged.
- * We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- */
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
-{
- u64 *p;
- int lo, hi;
- int rv = 1;
- unsigned long flags;
-
- if (bb->shift < 0)
- /* badblocks are disabled */
- return 0;
-
- if (bb->shift) {
- /* round the start down, and the end up */
- sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
- sectors = next - s;
- }
-
- write_seqlock_irqsave(&bb->lock, flags);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
-
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- }
- }
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- lo = hi;
- hi++;
- }
- }
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
- }
- }
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi' */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- /* No room for more */
- rv = 0;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
-
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
- }
-
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- write_sequnlock_irqrestore(&bb->lock, flags);
-
- return rv;
-}
+/* Bad block management */
+/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8734,114 +8485,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = md_set_badblocks(&rdev->badblocks,
- s, sectors, 0);
- if (rv) {
+ rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
+ if (rv == 0) {
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
- }
- return rv;
+ return 1;
+ } else
+ return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-/*
- * Remove a range of bad blocks from the table.
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- */
-static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
-{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
- int rv = 0;
-
- if (bb->shift > 0) {
- /* When clearing we round the start up and the end down.
- * This should not matter as the shift should align with
- * the block size and no rounding should ever be needed.
- * However it is better the think a block is bad when it
- * isn't than to think a block is not bad when it is.
- */
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
- sectors = target - s;
- }
-
- write_seqlock_irq(&bb->lock);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
- }
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
- */
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
- }
- }
-
- bb->changed = 1;
-out:
- write_sequnlock_irq(&bb->lock);
- return rv;
-}
-
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8849,133 +8505,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- return md_clear_badblocks(&rdev->badblocks,
+ return badblocks_clear(&rdev->badblocks,
s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
-/*
- * Acknowledge all bad blocks in a list.
- * This only succeeds if ->changed is clear. It is used by
- * in-kernel metadata updates
- */
-void md_ack_all_badblocks(struct badblocks *bb)
-{
- if (bb->page == NULL || bb->changed)
- /* no point even trying */
- return;
- write_seqlock_irq(&bb->lock);
-
- if (bb->changed == 0 && bb->unacked_exist) {
- u64 *p = bb->page;
- int i;
- for (i = 0; i < bb->count ; i++) {
- if (!BB_ACK(p[i])) {
- sector_t start = BB_OFFSET(p[i]);
- int len = BB_LEN(p[i]);
- p[i] = BB_MAKE(start, len, 1);
- }
- }
- bb->unacked_exist = 0;
- }
- write_sequnlock_irq(&bb->lock);
-}
-EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
-
-/* sysfs access to bad-blocks list.
- * We present two files.
- * 'bad-blocks' lists sector numbers and lengths of ranges that
- * are recorded as bad. The list is truncated to fit within
- * the one-page limit of sysfs.
- * Writing "sector length" to this file adds an acknowledged
- * bad block list.
- * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
- * been acknowledged. Writing to this file adds bad blocks
- * without acknowledging them. This is largely for testing.
- */
-
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack)
-{
- size_t len;
- int i;
- u64 *p = bb->page;
- unsigned seq;
-
- if (bb->shift < 0)
- return 0;
-
-retry:
- seq = read_seqbegin(&bb->lock);
-
- len = 0;
- i = 0;
-
- while (len < PAGE_SIZE && i < bb->count) {
- sector_t s = BB_OFFSET(p[i]);
- unsigned int length = BB_LEN(p[i]);
- int ack = BB_ACK(p[i]);
- i++;
-
- if (unack && ack)
- continue;
-
- len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
- (unsigned long long)s << bb->shift,
- length << bb->shift);
- }
- if (unack && len == 0)
- bb->unacked_exist = 0;
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return len;
-}
-
-#define DO_DEBUG 1
-
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
-{
- unsigned long long sector;
- int length;
- char newline;
-#ifdef DO_DEBUG
- /* Allow clearing via sysfs *only* for testing/debugging.
- * Normally only a successful write may clear a badblock
- */
- int clear = 0;
- if (page[0] == '-') {
- clear = 1;
- page++;
- }
-#endif /* DO_DEBUG */
-
- switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
- case 3:
- if (newline != '\n')
- return -EINVAL;
- case 2:
- if (length <= 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
-#ifdef DO_DEBUG
- if (clear) {
- md_clear_badblocks(bb, sector, length);
- return len;
- }
-#endif /* DO_DEBUG */
- if (md_set_badblocks(bb, sector, length, !unack))
- return len;
- else
- return -ENOSPC;
-}
-
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2bea51edfab7..389afc420db6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -17,6 +17,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
@@ -28,13 +29,6 @@
#define MaxSector (~(sector_t)0)
-/* Bad block numbers are stored sorted in a single page.
- * 64bits is used for each block or extent.
- * 54 bits are sector number, 9 bits are extent size,
- * 1 bit is an 'acknowledged' flag.
- */
-#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
-
/*
* MD's 'extended' device
*/
@@ -117,22 +111,7 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
- struct badblocks {
- int count; /* count of bad blocks */
- int unacked_exist; /* there probably are unacknowledged
- * bad blocks. This is only cleared
- * when a read discovers none
- */
- int shift; /* shift from sectors to block size
- * a -ve shift means badblocks are
- * disabled.*/
- u64 *page; /* badblock list */
- int changed;
- seqlock_t lock;
-
- sector_t sector;
- sector_t size; /* in sectors */
- } badblocks;
+ struct badblocks badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -185,22 +164,11 @@ enum flag_bits {
*/
};
-#define BB_LEN_MASK (0x00000000000001FFULL)
-#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
-#define BB_ACK_MASK (0x8000000000000000ULL)
-#define BB_MAX_LEN 512
-#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
-#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
-#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
-#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
-
-extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors);
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
- int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+ int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
sectors,
first_bad, bad_sectors);
if (rv)
@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
-extern void md_ack_all_badblocks(struct badblocks *bb);
-
struct md_cluster_info;
struct mddev {
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index c573402033b2..b1ced58eb5e1 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -63,6 +63,11 @@ int lower_bound(struct btree_node *n, uint64_t key)
return bsearch(n, key, 0);
}
+static int upper_bound(struct btree_node *n, uint64_t key)
+{
+ return bsearch(n, key, 1);
+}
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt)
{
@@ -252,6 +257,16 @@ static void pop_frame(struct del_stack *s)
dm_tm_unlock(s->tm, f->b);
}
+static void unlock_all_frames(struct del_stack *s)
+{
+ struct frame *f;
+
+ while (unprocessed_frames(s)) {
+ f = s->spine + s->top--;
+ dm_tm_unlock(s->tm, f->b);
+ }
+}
+
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
int r;
@@ -308,9 +323,13 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
pop_frame(s);
}
}
-
out:
+ if (r) {
+ /* cleanup all frames of del_stack */
+ unlock_all_frames(s);
+ }
kfree(s);
+
return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
@@ -392,6 +411,82 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
+static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
+ uint64_t key, uint64_t *rkey, void *value_le)
+{
+ int r, i;
+ uint32_t flags, nr_entries;
+ struct dm_block *node;
+ struct btree_node *n;
+
+ r = bn_read_lock(info, root, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
+ flags = le32_to_cpu(n->header.flags);
+ nr_entries = le32_to_cpu(n->header.nr_entries);
+
+ if (flags & INTERNAL_NODE) {
+ i = lower_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ if (r == -ENODATA && i < (nr_entries - 1)) {
+ i++;
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ }
+
+ } else {
+ i = upper_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ *rkey = le64_to_cpu(n->keys[i]);
+ memcpy(value_le, value_ptr(n, i), info->value_type.size);
+ }
+out:
+ dm_tm_unlock(info->tm, node);
+ return r;
+}
+
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le)
+{
+ unsigned level;
+ int r = -ENODATA;
+ __le64 internal_value_le;
+ struct ro_spine spine;
+
+ init_ro_spine(&spine, info);
+ for (level = 0; level < info->levels - 1u; level++) {
+ r = btree_lookup_raw(&spine, root, keys[level],
+ lower_bound, rkey,
+ &internal_value_le, sizeof(uint64_t));
+ if (r)
+ goto out;
+
+ if (*rkey != keys[level]) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ root = le64_to_cpu(internal_value_le);
+ }
+
+ r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
+out:
+ exit_ro_spine(&spine);
+ return r;
+}
+
+EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
+
/*
* Splits a node by creating a sibling node and shifting half the nodes
* contents across. Assumes there is a parent node, and it has room for
@@ -473,8 +568,10 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
r = insert_at(sizeof(__le64), pn, parent_index + 1,
le64_to_cpu(rn->keys[0]), &location);
- if (r)
+ if (r) {
+ unlock_block(s->info, right);
return r;
+ }
if (key < le64_to_cpu(rn->keys[0])) {
unlock_block(s->info, right);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 11d8cf78621d..c74301fa5a37 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -110,6 +110,13 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, void *value_le);
/*
+ * Tries to find the first key where the bottom level key is >= to that
+ * given. Useful for skipping empty sections of the btree.
+ */
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le);
+
+/*
* Insertion (or overwrite an existing value). O(ln(n))
*/
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
@@ -135,9 +142,10 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root);
/*
- * Removes values between 'keys' and keys2, where keys2 is keys with the
- * final key replaced with 'end_key'. 'end_key' is the one-past-the-end
- * value. 'keys' may be altered.
+ * Removes a _contiguous_ run of values starting from 'keys' and not
+ * reaching keys2 (where keys2 is keys with the final key replaced with
+ * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be
+ * altered.
*/
int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, uint64_t end_key,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 53091295fce9..fca6dbcf9a47 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -136,7 +136,7 @@ static int brb_push(struct bop_ring_buffer *brb,
return 0;
}
-static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
{
struct block_op *bop;
@@ -147,6 +147,17 @@ static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
result->type = bop->type;
result->block = bop->block;
+ return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb)
+{
+ struct block_op *bop;
+
+ if (brb_empty(brb))
+ return -ENODATA;
+
+ bop = brb->bops + brb->begin;
brb->begin = brb_next(brb, brb->begin);
return 0;
@@ -211,7 +222,7 @@ static int apply_bops(struct sm_metadata *smm)
while (!brb_empty(&smm->uncommitted)) {
struct block_op bop;
- r = brb_pop(&smm->uncommitted, &bop);
+ r = brb_peek(&smm->uncommitted, &bop);
if (r) {
DMERR("bug in bop ring buffer");
break;
@@ -220,6 +231,8 @@ static int apply_bops(struct sm_metadata *smm)
r = commit_bop(smm, &bop);
if (r)
break;
+
+ brb_pop(&smm->uncommitted);
}
return r;
@@ -683,7 +696,6 @@ static struct dm_space_map bootstrap_ops = {
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
int r, i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
dm_block_t old_len = smm->ll.nr_blocks;
@@ -705,11 +717,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* allocate any new blocks.
*/
do {
- for (i = old_len; !r && i < smm->begin; i++) {
- r = sm_ll_inc(&smm->ll, i, &ev);
- if (r)
- goto out;
- }
+ for (i = old_len; !r && i < smm->begin; i++)
+ r = add_bop(smm, BOP_INC, i);
+
+ if (r)
+ goto out;
+
old_len = smm->begin;
r = apply_bops(smm);
@@ -754,7 +767,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
{
int r;
dm_block_t i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
smm->begin = superblock + 1;
@@ -782,7 +794,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
* allocated blocks that they were built from.
*/
for (i = superblock; !r && i < smm->begin; i++)
- r = sm_ll_inc(&smm->ll, i, &ev);
+ r = add_bop(smm, BOP_INC, i);
if (r)
return r;