aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig31
-rw-r--r--drivers/md/Makefile16
-rw-r--r--drivers/md/bitmap.c85
-rw-r--r--drivers/md/bitmap.h288
-rw-r--r--drivers/md/dm-bio-list.h107
-rw-r--r--drivers/md/dm-bio-record.h26
-rw-r--r--drivers/md/dm-crypt.c6
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-exception-store.c252
-rw-r--r--drivers/md/dm-exception-store.h58
-rw-r--r--drivers/md/dm-io.c5
-rw-r--r--drivers/md/dm-ioctl.c21
-rw-r--r--drivers/md/dm-kcopyd.c23
-rw-r--r--drivers/md/dm-linear.c1
-rw-r--r--drivers/md/dm-log.c75
-rw-r--r--drivers/md/dm-mpath.c1
-rw-r--r--drivers/md/dm-path-selector.c21
-rw-r--r--drivers/md/dm-raid1.c51
-rw-r--r--drivers/md/dm-region-hash.c1
-rw-r--r--drivers/md/dm-snap-persistent.c153
-rw-r--r--drivers/md/dm-snap-transient.c86
-rw-r--r--drivers/md/dm-snap.c385
-rw-r--r--drivers/md/dm-snap.h105
-rw-r--r--drivers/md/dm-table.c85
-rw-r--r--drivers/md/dm-target.c104
-rw-r--r--drivers/md/dm.c298
-rw-r--r--drivers/md/dm.h3
-rw-r--r--drivers/md/faulty.c19
-rw-r--r--drivers/md/linear.c25
-rw-r--r--drivers/md/linear.h29
-rw-r--r--drivers/md/md.c706
-rw-r--r--drivers/md/md.h441
-rw-r--r--drivers/md/mktables.c14
-rw-r--r--drivers/md/multipath.c17
-rw-r--r--drivers/md/multipath.h40
-rw-r--r--drivers/md/raid0.c66
-rw-r--r--drivers/md/raid0.h28
-rw-r--r--drivers/md/raid1.c43
-rw-r--r--drivers/md/raid1.h132
-rw-r--r--drivers/md/raid10.c53
-rw-r--r--drivers/md/raid10.h121
-rw-r--r--drivers/md/raid5.c1501
-rw-r--r--drivers/md/raid5.h474
-rw-r--r--drivers/md/raid6.h130
-rw-r--r--drivers/md/raid6algos.c21
-rw-r--r--drivers/md/raid6altivec.uc4
-rw-r--r--drivers/md/raid6int.uc4
-rw-r--r--drivers/md/raid6mmx.c4
-rw-r--r--drivers/md/raid6recov.c13
-rw-r--r--drivers/md/raid6sse1.c4
-rw-r--r--drivers/md/raid6sse2.c4
-rw-r--r--drivers/md/raid6test/Makefile2
-rw-r--r--drivers/md/raid6test/test.c2
-rw-r--r--drivers/md/raid6x86.h2
54 files changed, 4487 insertions, 1701 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2281b5098e95..36e0675be9f7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -121,6 +121,7 @@ config MD_RAID10
config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
+ select MD_RAID6_PQ
select ASYNC_MEMCPY
select ASYNC_XOR
---help---
@@ -151,34 +152,8 @@ config MD_RAID456
If unsure, say Y.
-config MD_RAID5_RESHAPE
- bool "Support adding drives to a raid-5 array"
- depends on MD_RAID456
- default y
- ---help---
- A RAID-5 set can be expanded by adding extra drives. This
- requires "restriping" the array which means (almost) every
- block must be written to a different place.
-
- This option allows such restriping to be done while the array
- is online.
-
- You will need mdadm version 2.4.1 or later to use this
- feature safely. During the early stage of reshape there is
- a critical section where live data is being over-written. A
- crash during this time needs extra care for recovery. The
- newer mdadm takes a copy of the data in the critical section
- and will restore it, if necessary, after a crash.
-
- The mdadm usage is e.g.
- mdadm --grow /dev/md1 --raid-disks=6
- to grow '/dev/md1' to having 6 disks.
-
- Note: The array can only be expanded, not contracted.
- There should be enough spares already present to make the new
- array workable.
-
- If unsure, say Y.
+config MD_RAID6_PQ
+ tristate
config MD_MULTIPATH
tristate "Multipath I/O support"
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 72880b7e28d9..45cc5951d928 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -2,20 +2,21 @@
# Makefile for the kernel software RAID and LVM drivers.
#
-dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y += dm-path-selector.o dm-mpath.o
+dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-snap-persistent.o
-dm-mirror-objs := dm-raid1.o
-md-mod-objs := md.o bitmap.o
-raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y += dm-raid1.o
+md-mod-y += md.o bitmap.o
+raid456-y += raid5.o
+raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \
raid6altivec1.o raid6altivec2.o raid6altivec4.o \
raid6altivec8.o \
raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y := mktables
+hostprogs-y += mktables
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
+obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o
obj-$(CONFIG_MD_RAID456) += raid456.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 719943763391..47c68bc75a17 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -16,6 +16,7 @@
* wait if count gets too high, wake when it drops to half.
*/
+#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
@@ -26,8 +27,8 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"
/* debug macros */
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
unsigned char *mappage;
if (page >= bitmap->pages) {
- printk(KERN_ALERT
- "%s: invalid bitmap page request: %lu (> %lu)\n",
- bmname(bitmap), page, bitmap->pages-1);
+ /* This can happen if bitmap_start_sync goes beyond
+ * End-of-device while looking for a whole page.
+ * It is harmless.
+ */
return -EINVAL;
}
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
list_for_each_continue_rcu(pos, &mddev->disks) {
rdev = list_entry(pos, mdk_rdev_t, same_set);
if (rdev->raid_disk >= 0 &&
- test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
/* this is a usable devices */
atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ size/512 > 0)
/* bitmap runs in to metadata */
goto bad_alignment;
- if (rdev->data_offset + mddev->size*2
+ if (rdev->data_offset + mddev->dev_sectors
> rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */
goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
reason = "unrecognized superblock version";
- else if (chunksize < PAGE_SIZE)
+ else if (chunksize < 512)
reason = "bitmap chunksize too small";
else if ((1 << ffz(~chunksize)) != chunksize)
reason = "bitmap chunksize not a power of 2";
@@ -985,6 +986,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
oldindex = index;
oldpage = page;
+ bitmap->filemap[bitmap->file_pages++] = page;
+ bitmap->last_page_size = count;
+
if (outofdate) {
/*
* if bitmap is out of date, dirty the
@@ -997,15 +1001,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
write_page(bitmap, page, 1);
ret = -EIO;
- if (bitmap->flags & BITMAP_WRITE_ERROR) {
- /* release, page not in filemap yet */
- put_page(page);
+ if (bitmap->flags & BITMAP_WRITE_ERROR)
goto err;
- }
}
-
- bitmap->filemap[bitmap->file_pages++] = page;
- bitmap->last_page_size = count;
}
paddr = kmap_atomic(page, KM_USER0);
if (bitmap->flags & BITMAP_HOSTENDIAN)
@@ -1015,9 +1013,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
kunmap_atomic(paddr, KM_USER0);
if (b) {
/* if the disk bit is set, set the memory bit */
- bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
- ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
- );
+ int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+ >= start);
+ bitmap_set_memory_bits(bitmap,
+ (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+ needed);
bit_cnt++;
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
@@ -1153,8 +1153,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
spin_lock_irqsave(&bitmap->lock, flags);
clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
- bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
- &blocks, 0);
+ bmc = bitmap_get_counter(bitmap,
+ (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ &blocks, 0);
if (bmc) {
/*
if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
@@ -1168,7 +1169,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
} else if (*bmc == 1) {
/* we can clear the bit */
*bmc = 0;
- bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
+ bitmap_count_page(bitmap,
+ (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
-1);
/* clear the bit */
@@ -1306,6 +1308,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
}
+ if (bitmap->mddev->degraded)
+ /* Never clear bits or update events_cleared when degraded */
+ success = 0;
while (sectors) {
int blocks;
@@ -1345,8 +1350,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
}
}
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
- int degraded)
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+ int degraded)
{
bitmap_counter_t *bmc;
int rv;
@@ -1374,6 +1379,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
return rv;
}
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+ int degraded)
+{
+ /* bitmap_start_sync must always report on multiples of whole
+ * pages, otherwise resync (which is very PAGE_SIZE based) will
+ * get confused.
+ * So call __bitmap_start_sync repeatedly (if needed) until
+ * At least PAGE_SIZE>>9 blocks are covered.
+ * Return the 'or' of the result.
+ */
+ int rv = 0;
+ int blocks1;
+
+ *blocks = 0;
+ while (*blocks < (PAGE_SIZE>>9)) {
+ rv |= __bitmap_start_sync(bitmap, offset,
+ &blocks1, degraded);
+ offset += blocks1;
+ *blocks += blocks1;
+ }
+ return rv;
+}
+
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
{
bitmap_counter_t *bmc;
@@ -1443,6 +1471,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
wait_event(bitmap->mddev->recovery_wait,
atomic_read(&bitmap->mddev->recovery_active) == 0);
+ bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+ set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
@@ -1450,6 +1480,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
s += blocks;
}
bitmap->last_end_sync = jiffies;
+ sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
}
static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
@@ -1484,7 +1515,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) {
- sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
bitmap_set_memory_bits(bitmap, sec, 1);
bitmap_file_set_bit(bitmap, sec);
}
@@ -1560,7 +1591,7 @@ void bitmap_destroy(mddev_t *mddev)
int bitmap_create(mddev_t *mddev)
{
struct bitmap *bitmap;
- unsigned long blocks = mddev->resync_max_sectors;
+ sector_t blocks = mddev->resync_max_sectors;
unsigned long chunks;
unsigned long pages;
struct file *file = mddev->bitmap_file;
@@ -1602,8 +1633,8 @@ int bitmap_create(mddev_t *mddev)
bitmap->chunkshift = ffz(~bitmap->chunksize);
/* now that chunksize and chunkshift are set, we can use these macros */
- chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
- CHUNK_BLOCK_RATIO(bitmap);
+ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
+ CHUNK_BLOCK_SHIFT(bitmap);
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
new file mode 100644
index 000000000000..e98900671ca9
--- /dev/null
+++ b/drivers/md/bitmap.h
@@ -0,0 +1,288 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_HOSTENDIAN 3
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed. The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync | counter |
+ * | needed | active | |
+ * | (0-1) | (0-1) | (0-16383) |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ * a '1' bit is read from storage at startup.
+ * a write request fails on some drives
+ * a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ * a resync is started on all drives, and resync_needed is set.
+ * resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared aswell, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ * page pointer (32-bit)
+ *
+ * [ ] ------+
+ * |
+ * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
+ * c1 c2 c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ * hijacked page pointer (32-bit)
+ *
+ * [ ][ ] (no page memory allocated)
+ * counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+ (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+ (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+ BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
+ BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
+ BITMAP_HOSTENDIAN = 0x8000,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+ __le32 magic; /* 0 BITMAP_MAGIC */
+ __le32 version; /* 4 the bitmap major for now, could change... */
+ __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
+ __le64 events; /* 24 event counter for the bitmap (1)*/
+ __le64 events_cleared;/*32 event counter when last bit cleared (2) */
+ __le64 sync_size; /* 40 the size of the md device's sync range(3) */
+ __le32 state; /* 48 bitmap state information */
+ __le32 chunksize; /* 52 the bitmap chunk size in bytes */
+ __le32 daemon_sleep; /* 56 seconds between disk flushes */
+ __le32 write_behind; /* 60 number of outstanding write-behind writes */
+
+ __u8 pad[256 - 64]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ * When a bitmap is loaded, it is only accepted if this event counter is equal
+ * to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ * array is not degraded. As bits are not cleared when the array is degraded,
+ * this represents the last time that any bits were cleared.
+ * If a device is being added that has an event count with this value or
+ * higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ * resync happens across. For raid1 and raid5/6 it is the size of individual
+ * devices. For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+ /*
+ * map points to the actual memory page
+ */
+ char *map;
+ /*
+ * in emergencies (when map cannot be alloced), hijack the map
+ * pointer and use it as two counters itself
+ */
+ unsigned int hijacked:1;
+ /*
+ * count of dirty bits on the page
+ */
+ unsigned int count:31;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+ struct list_head list;
+ struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+ struct bitmap_page *bp;
+ unsigned long pages; /* total number of pages in the bitmap */
+ unsigned long missing_pages; /* number of pages not yet allocated */
+
+ mddev_t *mddev; /* the md device that the bitmap is for */
+
+ int counter_bits; /* how many bits per block counter */
+
+ /* bitmap chunksize -- how much data does each bit represent? */
+ unsigned long chunksize;
+ unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunks; /* total number of data chunks for the array */
+
+ /* We hold a count on the chunk currently being synced, and drop
+ * it when the last block is started. If the resync is aborted
+ * midway, we need to be able to drop that count, so we remember
+ * the counted chunk..
+ */
+ unsigned long syncchunk;
+
+ __u64 events_cleared;
+ int need_sync;
+
+ /* bitmap spinlock */
+ spinlock_t lock;
+
+ long offset; /* offset from superblock if file is NULL */
+ struct file *file; /* backing disk file */
+ struct page *sb_page; /* cached copy of the bitmap file superblock */
+ struct page **filemap; /* list of cache pages for the file */
+ unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+ unsigned long file_pages; /* number of pages in the file */
+ int last_page_size; /* bytes in the last page */
+
+ unsigned long flags;
+
+ int allclean;
+
+ unsigned long max_write_behind; /* write-behind mode */
+ atomic_t behind_writes;
+
+ /*
+ * the bitmap daemon - periodically wakes up and sweeps the bitmap
+ * file, cleaning up bits and flushing out pages to disk as necessary
+ */
+ unsigned long daemon_lastrun; /* jiffies of last run */
+ unsigned long daemon_sleep; /* how many seconds between updates? */
+ unsigned long last_end_sync; /* when we lasted called end_sync to
+ * update bitmap with resync progress */
+
+ atomic_t pending_writes; /* pending writes to the bitmap file */
+ wait_queue_head_t write_wait;
+ wait_queue_head_t overflow_wait;
+
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+
+int bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+ unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+ unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct bitmap *bitmap);
+#endif
+
+#endif
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
deleted file mode 100644
index d4509be0fe67..000000000000
--- a/drivers/md/dm-bio-list.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2004 Red Hat UK Ltd.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_BIO_LIST_H
-#define DM_BIO_LIST_H
-
-#include <linux/bio.h>
-
-#ifdef CONFIG_BLOCK
-
-struct bio_list {
- struct bio *head;
- struct bio *tail;
-};
-
-static inline int bio_list_empty(const struct bio_list *bl)
-{
- return bl->head == NULL;
-}
-
-static inline void bio_list_init(struct bio_list *bl)
-{
- bl->head = bl->tail = NULL;
-}
-
-#define bio_list_for_each(bio, bl) \
- for (bio = (bl)->head; bio; bio = bio->bi_next)
-
-static inline unsigned bio_list_size(const struct bio_list *bl)
-{
- unsigned sz = 0;
- struct bio *bio;
-
- bio_list_for_each(bio, bl)
- sz++;
-
- return sz;
-}
-
-static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
-{
- bio->bi_next = NULL;
-
- if (bl->tail)
- bl->tail->bi_next = bio;
- else
- bl->head = bio;
-
- bl->tail = bio;
-}
-
-static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
-{
- if (!bl2->head)
- return;
-
- if (bl->tail)
- bl->tail->bi_next = bl2->head;
- else
- bl->head = bl2->head;
-
- bl->tail = bl2->tail;
-}
-
-static inline void bio_list_merge_head(struct bio_list *bl,
- struct bio_list *bl2)
-{
- if (!bl2->head)
- return;
-
- if (bl->head)
- bl2->tail->bi_next = bl->head;
- else
- bl->tail = bl2->tail;
-
- bl->head = bl2->head;
-}
-
-static inline struct bio *bio_list_pop(struct bio_list *bl)
-{
- struct bio *bio = bl->head;
-
- if (bio) {
- bl->head = bl->head->bi_next;
- if (!bl->head)
- bl->tail = NULL;
-
- bio->bi_next = NULL;
- }
-
- return bio;
-}
-
-static inline struct bio *bio_list_get(struct bio_list *bl)
-{
- struct bio *bio = bl->head;
-
- bl->head = bl->tail = NULL;
-
- return bio;
-}
-
-#endif /* CONFIG_BLOCK */
-#endif
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index d3ec217847d6..3a8cfa2645c7 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -16,30 +16,56 @@
* functions in this file help the target record and restore the
* original bio state.
*/
+
+struct dm_bio_vec_details {
+#if PAGE_SIZE < 65536
+ __u16 bv_len;
+ __u16 bv_offset;
+#else
+ unsigned bv_len;
+ unsigned bv_offset;
+#endif
+};
+
struct dm_bio_details {
sector_t bi_sector;
struct block_device *bi_bdev;
unsigned int bi_size;
unsigned short bi_idx;
unsigned long bi_flags;
+ struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
};
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
+ unsigned i;
+
bd->bi_sector = bio->bi_sector;
bd->bi_bdev = bio->bi_bdev;
bd->bi_size = bio->bi_size;
bd->bi_idx = bio->bi_idx;
bd->bi_flags = bio->bi_flags;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
+ bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
+ }
}
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
+ unsigned i;
+
bio->bi_sector = bd->bi_sector;
bio->bi_bdev = bd->bi_bdev;
bio->bi_size = bd->bi_size;
bio->bi_idx = bd->bi_idx;
bio->bi_flags = bd->bi_flags;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
+ bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
+ }
}
#endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bfefd079a955..53394e863c74 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1156,8 +1156,7 @@ bad_ivmode:
crypto_free_ablkcipher(tfm);
bad_cipher:
/* Must zero key material before freeing */
- memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
+ kzfree(cc);
return -EINVAL;
}
@@ -1183,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti)
dm_put_device(ti, cc->dev);
/* Must zero key material before freeing */
- memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
+ kzfree(cc);
}
static int crypt_map(struct dm_target *ti, struct bio *bio,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 59ee1b015d2d..559dbb52bc85 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -15,8 +15,6 @@
#include <linux/device-mapper.h>
-#include "dm-bio-list.h"
-
#define DM_MSG_PREFIX "delay"
struct delay_c {
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index dccbfb0e010f..a2e26c242141 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -7,6 +7,7 @@
#include "dm-exception-store.h"
+#include <linux/ctype.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
@@ -14,6 +15,257 @@
#define DM_MSG_PREFIX "snapshot exception stores"
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ list_for_each_entry(type, &_exception_store_types, list)
+ if (!strcmp(name, type->name))
+ return type;
+
+ return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ spin_lock(&_lock);
+
+ type = __find_exception_store_type(name);
+
+ if (type && !try_module_get(type->module))
+ type = NULL;
+
+ spin_unlock(&_lock);
+
+ return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name. If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long of a name in my
+ * opinion, which is why I've chosen to have the files
+ * containing exception store implementations be 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you will follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+ char *p, *type_name_dup;
+ struct dm_exception_store_type *type;
+
+ type = _get_exception_store_type(type_name);
+ if (type)
+ return type;
+
+ type_name_dup = kstrdup(type_name, GFP_KERNEL);
+ if (!type_name_dup) {
+ DMERR("No memory left to attempt load for \"%s\"", type_name);
+ return NULL;
+ }
+
+ while (request_module("dm-exstore-%s", type_name_dup) ||
+ !(type = _get_exception_store_type(type_name))) {
+ p = strrchr(type_name_dup, '-');
+ if (!p)
+ break;
+ p[0] = '\0';
+ }
+
+ if (!type)
+ DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+ kfree(type_name_dup);
+
+ return type;
+}
+
+static void put_type(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+ module_put(type->module);
+ spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+ int r = 0;
+
+ spin_lock(&_lock);
+ if (!__find_exception_store_type(type->name))
+ list_add(&type->list, &_exception_store_types);
+ else
+ r = -EEXIST;
+ spin_unlock(&_lock);
+
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+
+ if (!__find_exception_store_type(type->name)) {
+ spin_unlock(&_lock);
+ return -EINVAL;
+ }
+
+ list_del(&type->list);
+
+ spin_unlock(&_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+/*
+ * Round a number up to the nearest 'size' boundary. size must
+ * be a power of 2.
+ */
+static ulong round_up(ulong n, ulong size)
+{
+ size--;
+ return (n + size) & ~size;
+}
+
+static int set_chunk_size(struct dm_exception_store *store,
+ const char *chunk_size_arg, char **error)
+{
+ unsigned long chunk_size_ulong;
+ char *value;
+
+ chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
+ if (*chunk_size_arg == '\0' || *value != '\0') {
+ *error = "Invalid chunk size";
+ return -EINVAL;
+ }
+
+ if (!chunk_size_ulong) {
+ store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+ return 0;
+ }
+
+ /*
+ * Chunk size must be multiple of page size. Silently
+ * round up if it's not.
+ */
+ chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
+
+ /* Check chunk_size is a power of 2 */
+ if (!is_power_of_2(chunk_size_ulong)) {
+ *error = "Chunk size is not a power of 2";
+ return -EINVAL;
+ }
+
+ /* Validate the chunk size against the device block size */
+ if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+ *error = "Chunk size is not a multiple of device blocksize";
+ return -EINVAL;
+ }
+
+ store->chunk_size = chunk_size_ulong;
+ store->chunk_mask = chunk_size_ulong - 1;
+ store->chunk_shift = ffs(chunk_size_ulong) - 1;
+
+ return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ unsigned *args_used,
+ struct dm_exception_store **store)
+{
+ int r = 0;
+ struct dm_exception_store_type *type;
+ struct dm_exception_store *tmp_store;
+ char persistent;
+
+ if (argc < 3) {
+ ti->error = "Insufficient exception store arguments";
+ return -EINVAL;
+ }
+
+ tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+ if (!tmp_store) {
+ ti->error = "Exception store allocation failed";
+ return -ENOMEM;
+ }
+
+ persistent = toupper(*argv[1]);
+ if (persistent != 'P' && persistent != 'N') {
+ ti->error = "Persistent flag is not P or N";
+ return -EINVAL;
+ }
+
+ type = get_type(argv[1]);
+ if (!type) {
+ ti->error = "Exception store type not recognised";
+ r = -EINVAL;
+ goto bad_type;
+ }
+
+ tmp_store->type = type;
+ tmp_store->ti = ti;
+
+ r = dm_get_device(ti, argv[0], 0, 0,
+ FMODE_READ | FMODE_WRITE, &tmp_store->cow);
+ if (r) {
+ ti->error = "Cannot get COW device";
+ goto bad_cow;
+ }
+
+ r = set_chunk_size(tmp_store, argv[2], &ti->error);
+ if (r)
+ goto bad_cow;
+
+ r = type->ctr(tmp_store, 0, NULL);
+ if (r) {
+ ti->error = "Exception store type constructor failed";
+ goto bad_ctr;
+ }
+
+ *args_used = 3;
+ *store = tmp_store;
+ return 0;
+
+bad_ctr:
+ dm_put_device(ti, tmp_store->cow);
+bad_cow:
+ put_type(type);
+bad_type:
+ kfree(tmp_store);
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+ store->type->dtr(store);
+ dm_put_device(store->ti, store->cow);
+ put_type(store->type);
+ kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
int dm_exception_store_init(void)
{
int r;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index bb9f33d5daa2..0a2e6e7f67b3 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -37,11 +37,18 @@ struct dm_snap_exception {
* Abstraction to handle the meta/layout of exception stores (the
* COW device).
*/
-struct dm_exception_store {
+struct dm_exception_store;
+struct dm_exception_store_type {
+ const char *name;
+ struct module *module;
+
+ int (*ctr) (struct dm_exception_store *store,
+ unsigned argc, char **argv);
+
/*
* Destroys this object when you've finished with it.
*/
- void (*destroy) (struct dm_exception_store *store);
+ void (*dtr) (struct dm_exception_store *store);
/*
* The target shouldn't read the COW device until this is
@@ -72,8 +79,9 @@ struct dm_exception_store {
*/
void (*drop_snapshot) (struct dm_exception_store *store);
- int (*status) (struct dm_exception_store *store, status_type_t status,
- char *result, unsigned int maxlen);
+ unsigned (*status) (struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen);
/*
* Return how full the snapshot is.
@@ -82,7 +90,21 @@ struct dm_exception_store {
sector_t *numerator,
sector_t *denominator);
- struct dm_snapshot *snap;
+ /* For internal device-mapper use only. */
+ struct list_head list;
+};
+
+struct dm_exception_store {
+ struct dm_exception_store_type *type;
+ struct dm_target *ti;
+
+ struct dm_dev *cow;
+
+ /* Size of data blocks saved - must be a power of 2 */
+ chunk_t chunk_size;
+ chunk_t chunk_mask;
+ chunk_t chunk_shift;
+
void *context;
};
@@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
# endif
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+ sector_t sector)
+{
+ return (sector & ~store->chunk_mask) >> store->chunk_shift;
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ unsigned *args_used,
+ struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
int dm_exception_store_init(void);
void dm_exception_store_exit(void);
@@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void);
int dm_transient_snapshot_init(void);
void dm_transient_snapshot_exit(void);
-int dm_create_persistent(struct dm_exception_store *store);
-
-int dm_create_transient(struct dm_exception_store *store);
-
#endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 36e2b5e46a6b..e73aabd61cd7 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -370,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&io.count) || signal_pending(current))
+ if (!atomic_read(&io.count))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
- if (atomic_read(&io.count))
- return -EINTR;
-
if (error_bits)
*error_bits = io.error_bits;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f01096549a93..823ceba6efa8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1047,6 +1047,19 @@ static int populate_table(struct dm_table *table,
return dm_table_complete(table);
}
+static int table_prealloc_integrity(struct dm_table *t,
+ struct mapped_device *md)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list)
+ if (bdev_get_integrity(dd->dm_dev.bdev))
+ return blk_integrity_register(dm_disk(md), NULL);
+
+ return 0;
+}
+
static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
@@ -1068,6 +1081,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
goto out;
}
+ r = table_prealloc_integrity(t, md);
+ if (r) {
+ DMERR("%s: could not register integrity profile.",
+ dm_device_name(md));
+ dm_table_destroy(t);
+ goto out;
+ }
+
down_write(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 0a225da21272..3e3fc06cb861 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -297,7 +297,8 @@ static int run_complete_job(struct kcopyd_job *job)
dm_kcopyd_notify_fn fn = job->fn;
struct dm_kcopyd_client *kc = job->kc;
- kcopyd_put_pages(kc, job->pages);
+ if (job->pages)
+ kcopyd_put_pages(kc, job->pages);
mempool_free(job, kc->job_pool);
fn(read_err, write_err, context);
@@ -461,6 +462,7 @@ static void segment_complete(int read_err, unsigned long write_err,
sector_t progress = 0;
sector_t count = 0;
struct kcopyd_job *job = (struct kcopyd_job *) context;
+ struct dm_kcopyd_client *kc = job->kc;
mutex_lock(&job->lock);
@@ -490,7 +492,7 @@ static void segment_complete(int read_err, unsigned long write_err,
if (count) {
int i;
- struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool,
+ struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
GFP_NOIO);
*sub_job = *job;
@@ -509,13 +511,16 @@ static void segment_complete(int read_err, unsigned long write_err,
} else if (atomic_dec_and_test(&job->sub_jobs)) {
/*
- * To avoid a race we must keep the job around
- * until after the notify function has completed.
- * Otherwise the client may try and stop the job
- * after we've completed.
+ * Queue the completion callback to the kcopyd thread.
+ *
+ * Some callers assume that all the completions are called
+ * from a single thread and don't race with each other.
+ *
+ * We must not call the callback directly here because this
+ * code may not be executing in the thread.
*/
- job->fn(read_err, write_err, job->context);
- mempool_free(job, job->kc->job_pool);
+ push(&kc->complete_jobs, job);
+ wake(kc);
}
}
@@ -528,6 +533,8 @@ static void split_job(struct kcopyd_job *job)
{
int i;
+ atomic_inc(&job->kc->nr_jobs);
+
atomic_set(&job->sub_jobs, SPLIT_COUNT);
for (i = 0; i < SPLIT_COUNT; i++)
segment_complete(0, 0u, job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index bfa107f59d96..79fb53e51c70 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,6 @@ static struct target_type linear_target = {
.status = linear_status,
.ioctl = linear_ioctl,
.merge = linear_merge,
- .features = DM_TARGET_SUPPORTS_BARRIERS,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 737961f275c1..be233bc4d917 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -16,40 +16,29 @@
#define DM_MSG_PREFIX "dirty region log"
-struct dm_dirty_log_internal {
- struct dm_dirty_log_type *type;
-
- struct list_head list;
- long use;
-};
-
static LIST_HEAD(_log_types);
static DEFINE_SPINLOCK(_lock);
-static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
{
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
list_for_each_entry(log_type, &_log_types, list)
- if (!strcmp(name, log_type->type->name))
+ if (!strcmp(name, log_type->name))
return log_type;
return NULL;
}
-static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
{
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
spin_lock(&_lock);
log_type = __find_dirty_log_type(name);
- if (log_type) {
- if (!log_type->use && !try_module_get(log_type->type->module))
- log_type = NULL;
- else
- log_type->use++;
- }
+ if (log_type && !try_module_get(log_type->module))
+ log_type = NULL;
spin_unlock(&_lock);
@@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
static struct dm_dirty_log_type *get_type(const char *type_name)
{
char *p, *type_name_dup;
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
if (!type_name)
return NULL;
log_type = _get_dirty_log_type(type_name);
if (log_type)
- return log_type->type;
+ return log_type;
type_name_dup = kstrdup(type_name, GFP_KERNEL);
if (!type_name_dup) {
@@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name)
kfree(type_name_dup);
- return log_type ? log_type->type : NULL;
+ return log_type;
}
static void put_type(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type;
-
if (!type)
return;
spin_lock(&_lock);
- log_type = __find_dirty_log_type(type->name);
- if (!log_type)
+ if (!__find_dirty_log_type(type->name))
goto out;
- if (!--log_type->use)
- module_put(type->module);
-
- BUG_ON(log_type->use < 0);
+ module_put(type->module);
out:
spin_unlock(&_lock);
}
-static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type)
-{
- struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type),
- GFP_KERNEL);
-
- if (log_type)
- log_type->type = type;
-
- return log_type;
-}
-
int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type);
int r = 0;
- if (!log_type)
- return -ENOMEM;
-
spin_lock(&_lock);
if (!__find_dirty_log_type(type->name))
- list_add(&log_type->list, &_log_types);
- else {
- kfree(log_type);
+ list_add(&type->list, &_log_types);
+ else
r = -EEXIST;
- }
spin_unlock(&_lock);
return r;
@@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register);
int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type;
-
spin_lock(&_lock);
- log_type = __find_dirty_log_type(type->name);
- if (!log_type) {
+ if (!__find_dirty_log_type(type->name)) {
spin_unlock(&_lock);
return -EINVAL;
}
- if (log_type->use) {
- spin_unlock(&_lock);
- return -ETXTBSY;
- }
-
- list_del(&log_type->list);
+ list_del(&type->list);
spin_unlock(&_lock);
- kfree(log_type);
return 0;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 095f77bf9681..6a386ab4f7eb 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
#include <linux/device-mapper.h>
#include "dm-path-selector.h"
-#include "dm-bio-list.h"
#include "dm-bio-record.h"
#include "dm-uevent.h"
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index 96ea226155b1..42c04f04a0c4 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -17,9 +17,7 @@
struct ps_internal {
struct path_selector_type pst;
-
struct list_head list;
- long use;
};
#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
@@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name)
down_read(&_ps_lock);
psi = __find_path_selector_type(name);
- if (psi) {
- if ((psi->use == 0) && !try_module_get(psi->pst.module))
- psi = NULL;
- else
- psi->use++;
- }
+ if (psi && !try_module_get(psi->pst.module))
+ psi = NULL;
up_read(&_ps_lock);
return psi;
@@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst)
if (!psi)
goto out;
- if (--psi->use == 0)
- module_put(psi->pst.module);
-
- BUG_ON(psi->use < 0);
-
+ module_put(psi->pst.module);
out:
up_read(&_ps_lock);
}
@@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst)
return -EINVAL;
}
- if (psi->use) {
- up_write(&_ps_lock);
- return -ETXTBSY;
- }
-
list_del(&psi->list);
up_write(&_ps_lock);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4d6bc101962e..076fbb4e967a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -5,7 +5,6 @@
* This file is released under the GPL.
*/
-#include "dm-bio-list.h"
#include "dm-bio-record.h"
#include <linux/init.h>
@@ -145,6 +144,8 @@ struct dm_raid1_read_record {
struct dm_bio_details details;
};
+static struct kmem_cache *_dm_raid1_read_record_cache;
+
/*
* Every mirror should look like this one.
*/
@@ -586,6 +587,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
int state;
struct bio *bio;
struct bio_list sync, nosync, recover, *this_list = NULL;
+ struct bio_list requeue;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ region_t region;
if (!writes->head)
return;
@@ -596,10 +600,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&sync);
bio_list_init(&nosync);
bio_list_init(&recover);
+ bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
- state = dm_rh_get_state(ms->rh,
- dm_rh_bio_to_region(ms->rh, bio), 1);
+ region = dm_rh_bio_to_region(ms->rh, bio);
+
+ if (log->type->is_remote_recovering &&
+ log->type->is_remote_recovering(log, region)) {
+ bio_list_add(&requeue, bio);
+ continue;
+ }
+
+ state = dm_rh_get_state(ms->rh, region, 1);
switch (state) {
case DM_RH_CLEAN:
case DM_RH_DIRTY:
@@ -619,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
}
/*
+ * Add bios that are delayed due to remote recovery
+ * back on to the write queue
+ */
+ if (unlikely(requeue.head)) {
+ spin_lock_irq(&ms->lock);
+ bio_list_merge(&ms->writes, &requeue);
+ spin_unlock_irq(&ms->lock);
+ }
+
+ /*
* Increment the pending counts for any regions that will
* be written to (writes to recover regions are going to
* be delayed).
@@ -764,9 +786,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
- len = sizeof(struct dm_raid1_read_record);
- ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
- len);
+ ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
+ _dm_raid1_read_record_cache);
+
if (!ms->read_record_pool) {
ti->error = "Error creating mirror read_record_pool";
kfree(ms);
@@ -1279,16 +1301,31 @@ static int __init dm_mirror_init(void)
{
int r;
+ _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
+ if (!_dm_raid1_read_record_cache) {
+ DMERR("Can't allocate dm_raid1_read_record cache");
+ r = -ENOMEM;
+ goto bad_cache;
+ }
+
r = dm_register_target(&mirror_target);
- if (r < 0)
+ if (r < 0) {
DMERR("Failed to register mirror target");
+ goto bad_target;
+ }
+
+ return 0;
+bad_target:
+ kmem_cache_destroy(_dm_raid1_read_record_cache);
+bad_cache:
return r;
}
static void __exit dm_mirror_exit(void)
{
dm_unregister_target(&mirror_target);
+ kmem_cache_destroy(_dm_raid1_read_record_cache);
}
/* Module hooks */
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 59f8d9df9e1a..7b899be0b087 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -14,7 +14,6 @@
#include <linux/vmalloc.h>
#include "dm.h"
-#include "dm-bio-list.h"
#define DM_MSG_PREFIX "region hash"
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 936b34e0959f..e75c6dd76a9a 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -6,7 +6,6 @@
*/
#include "dm-exception-store.h"
-#include "dm-snap.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -89,7 +88,7 @@ struct commit_callback {
* The top level structure for a persistent exception store.
*/
struct pstore {
- struct dm_snapshot *snap; /* up pointer to my snapshot */
+ struct dm_exception_store *store;
int version;
int valid;
uint32_t exceptions_per_area;
@@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps)
int r = -ENOMEM;
size_t len;
- len = ps->snap->chunk_size << SECTOR_SHIFT;
+ len = ps->store->chunk_size << SECTOR_SHIFT;
/*
* Allocate the chunk_size block of memory that will hold
@@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps)
static void free_area(struct pstore *ps)
{
- vfree(ps->area);
+ if (ps->area)
+ vfree(ps->area);
ps->area = NULL;
- vfree(ps->zero_area);
+
+ if (ps->zero_area)
+ vfree(ps->zero_area);
ps->zero_area = NULL;
}
@@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work)
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
struct dm_io_region where = {
- .bdev = ps->snap->cow->bdev,
- .sector = ps->snap->chunk_size * chunk,
- .count = ps->snap->chunk_size,
+ .bdev = ps->store->cow->bdev,
+ .sector = ps->store->chunk_size * chunk,
+ .count = ps->store->chunk_size,
};
struct dm_io_request io_req = {
.bi_rw = rw,
@@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw)
static void zero_memory_area(struct pstore *ps)
{
- memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+ memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
}
static int zero_disk_area(struct pstore *ps, chunk_t area)
{
struct dm_io_region where = {
- .bdev = ps->snap->cow->bdev,
- .sector = ps->snap->chunk_size * area_location(ps, area),
- .count = ps->snap->chunk_size,
+ .bdev = ps->store->cow->bdev,
+ .sector = ps->store->chunk_size * area_location(ps, area),
+ .count = ps->store->chunk_size,
};
struct dm_io_request io_req = {
.bi_rw = WRITE,
@@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot)
/*
* Use default chunk size (or hardsect_size, if larger) if none supplied
*/
- if (!ps->snap->chunk_size) {
- ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
- bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
- ps->snap->chunk_mask = ps->snap->chunk_size - 1;
- ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+ if (!ps->store->chunk_size) {
+ ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+ bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+ ps->store->chunk_mask = ps->store->chunk_size - 1;
+ ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
chunk_size_supplied = 0;
}
- ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+ ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
chunk_size));
if (IS_ERR(ps->io_client))
return PTR_ERR(ps->io_client);
@@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
ps->version = le32_to_cpu(dh->version);
chunk_size = le32_to_cpu(dh->chunk_size);
- if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+ if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
return 0;
DMWARN("chunk size %llu in device metadata overrides "
"table chunk size of %llu.",
(unsigned long long)chunk_size,
- (unsigned long long)ps->snap->chunk_size);
+ (unsigned long long)ps->store->chunk_size);
/* We had a bogus chunk_size. Fix stuff up. */
free_area(ps);
- ps->snap->chunk_size = chunk_size;
- ps->snap->chunk_mask = chunk_size - 1;
- ps->snap->chunk_shift = ffs(chunk_size) - 1;
+ ps->store->chunk_size = chunk_size;
+ ps->store->chunk_mask = chunk_size - 1;
+ ps->store->chunk_shift = ffs(chunk_size) - 1;
- r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+ r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
ps->io_client);
if (r)
return r;
@@ -349,13 +351,13 @@ static int write_header(struct pstore *ps)
{
struct disk_header *dh;
- memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+ memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
dh = (struct disk_header *) ps->area;
dh->magic = cpu_to_le32(SNAP_MAGIC);
dh->valid = cpu_to_le32(ps->valid);
dh->version = cpu_to_le32(ps->version);
- dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+ dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
return chunk_io(ps, 0, WRITE, 1);
}
@@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store)
static void persistent_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
{
- *numerator = get_info(store)->next_free * store->snap->chunk_size;
- *denominator = get_dev_size(store->snap->cow->bdev);
+ *numerator = get_info(store)->next_free * store->chunk_size;
+ *denominator = get_dev_size(store->cow->bdev);
}
-static void persistent_destroy(struct dm_exception_store *store)
+static void persistent_dtr(struct dm_exception_store *store)
{
struct pstore *ps = get_info(store);
destroy_workqueue(ps->metadata_wq);
- dm_io_client_destroy(ps->io_client);
- vfree(ps->callbacks);
+
+ /* Created in read_header */
+ if (ps->io_client)
+ dm_io_client_destroy(ps->io_client);
free_area(ps);
+
+ /* Allocated in persistent_read_metadata */
+ if (ps->callbacks)
+ vfree(ps->callbacks);
+
kfree(ps);
}
@@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
/*
* Now we know correct chunk_size, complete the initialisation.
*/
- ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+ ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
sizeof(struct disk_exception);
ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
sizeof(*ps->callbacks));
@@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
struct pstore *ps = get_info(store);
uint32_t stride;
chunk_t next_free;
- sector_t size = get_dev_size(store->snap->cow->bdev);
+ sector_t size = get_dev_size(store->cow->bdev);
/* Is there enough room ? */
- if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+ if (size < ((ps->next_free + 1) * store->chunk_size))
return -ENOSPC;
e->new_chunk = ps->next_free;
@@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store)
DMWARN("write header failed");
}
-int dm_create_persistent(struct dm_exception_store *store)
+static int persistent_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
{
struct pstore *ps;
/* allocate the pstore */
- ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
if (!ps)
return -ENOMEM;
- ps->snap = store->snap;
+ ps->store = store;
ps->valid = 1;
ps->version = SNAPSHOT_DISK_VERSION;
ps->area = NULL;
@@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store)
return -ENOMEM;
}
- store->destroy = persistent_destroy;
- store->read_metadata = persistent_read_metadata;
- store->prepare_exception = persistent_prepare_exception;
- store->commit_exception = persistent_commit_exception;
- store->drop_snapshot = persistent_drop_snapshot;
- store->fraction_full = persistent_fraction_full;
store->context = ps;
return 0;
}
+static unsigned persistent_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" %s P %llu", store->cow->name,
+ (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+ .name = "persistent",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .drop_snapshot = persistent_drop_snapshot,
+ .fraction_full = persistent_fraction_full,
+ .status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+ .name = "P",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .drop_snapshot = persistent_drop_snapshot,
+ .fraction_full = persistent_fraction_full,
+ .status = persistent_status,
+};
+
int dm_persistent_snapshot_init(void)
{
- return 0;
+ int r;
+
+ r = dm_exception_store_type_register(&_persistent_type);
+ if (r) {
+ DMERR("Unable to register persistent exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_persistent_compat_type);
+ if (r) {
+ DMERR("Unable to register old-style persistent exception "
+ "store type");
+ dm_exception_store_type_unregister(&_persistent_type);
+ return r;
+ }
+
+ return r;
}
void dm_persistent_snapshot_exit(void)
{
+ dm_exception_store_type_unregister(&_persistent_type);
+ dm_exception_store_type_unregister(&_persistent_compat_type);
}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 7f6e2e6dcb0d..cde5aa558e6d 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -6,7 +6,6 @@
*/
#include "dm-exception-store.h"
-#include "dm-snap.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -23,7 +22,7 @@ struct transient_c {
sector_t next_free;
};
-static void transient_destroy(struct dm_exception_store *store)
+static void transient_dtr(struct dm_exception_store *store)
{
kfree(store->context);
}
@@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store,
static int transient_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
{
- struct transient_c *tc = (struct transient_c *) store->context;
- sector_t size = get_dev_size(store->snap->cow->bdev);
+ struct transient_c *tc = store->context;
+ sector_t size = get_dev_size(store->cow->bdev);
- if (size < (tc->next_free + store->snap->chunk_size))
+ if (size < (tc->next_free + store->chunk_size))
return -1;
- e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
- tc->next_free += store->snap->chunk_size;
+ e->new_chunk = sector_to_chunk(store, tc->next_free);
+ tc->next_free += store->chunk_size;
return 0;
}
@@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
{
*numerator = ((struct transient_c *) store->context)->next_free;
- *denominator = get_dev_size(store->snap->cow->bdev);
+ *denominator = get_dev_size(store->cow->bdev);
}
-int dm_create_transient(struct dm_exception_store *store)
+static int transient_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
{
struct transient_c *tc;
- store->destroy = transient_destroy;
- store->read_metadata = transient_read_metadata;
- store->prepare_exception = transient_prepare_exception;
- store->commit_exception = transient_commit_exception;
- store->drop_snapshot = NULL;
- store->fraction_full = transient_fraction_full;
-
tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
if (!tc)
return -ENOMEM;
@@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store)
return 0;
}
+static unsigned transient_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" %s N %llu", store->cow->name,
+ (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+ .name = "transient",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .fraction_full = transient_fraction_full,
+ .status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+ .name = "N",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .fraction_full = transient_fraction_full,
+ .status = transient_status,
+};
+
int dm_transient_snapshot_init(void)
{
- return 0;
+ int r;
+
+ r = dm_exception_store_type_register(&_transient_type);
+ if (r) {
+ DMWARN("Unable to register transient exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_transient_compat_type);
+ if (r) {
+ DMWARN("Unable to register old-style transient "
+ "exception store type");
+ dm_exception_store_type_unregister(&_transient_type);
+ return r;
+ }
+
+ return r;
}
void dm_transient_snapshot_exit(void)
{
+ dm_exception_store_type_unregister(&_transient_type);
+ dm_exception_store_type_unregister(&_transient_compat_type);
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 65ff82ff124e..d73f17fc7778 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -7,7 +7,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/delay.h>
#include <linux/fs.h>
@@ -20,10 +19,9 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
#include "dm-exception-store.h"
-#include "dm-snap.h"
-#include "dm-bio-list.h"
#define DM_MSG_PREFIX "snapshots"
@@ -47,9 +45,76 @@
*/
#define MIN_IOS 256
+#define DM_TRACKED_CHUNK_HASH_SIZE 16
+#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
+ (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
+struct exception_table {
+ uint32_t hash_mask;
+ unsigned hash_shift;
+ struct list_head *table;
+};
+
+struct dm_snapshot {
+ struct rw_semaphore lock;
+
+ struct dm_dev *origin;
+
+ /* List of snapshots per Origin */
+ struct list_head list;
+
+ /* You can't use a snapshot if this is 0 (e.g. if full) */
+ int valid;
+
+ /* Origin writes don't trigger exceptions until this is set */
+ int active;
+
+ mempool_t *pending_pool;
+
+ atomic_t pending_exceptions_count;
+
+ struct exception_table pending;
+ struct exception_table complete;
+
+ /*
+ * pe_lock protects all pending_exception operations and access
+ * as well as the snapshot_bios list.
+ */
+ spinlock_t pe_lock;
+
+ /* The on disk metadata handler */
+ struct dm_exception_store *store;
+
+ struct dm_kcopyd_client *kcopyd_client;
+
+ /* Queue of snapshot writes for ksnapd to flush */
+ struct bio_list queued_bios;
+ struct work_struct queued_bios_work;
+
+ /* Chunks with outstanding reads */
+ mempool_t *tracked_chunk_pool;
+ spinlock_t tracked_chunk_lock;
+ struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+};
+
static struct workqueue_struct *ksnapd;
static void flush_queued_bios(struct work_struct *work);
+static sector_t chunk_to_sector(struct dm_exception_store *store,
+ chunk_t chunk)
+{
+ return chunk << store->chunk_shift;
+}
+
+static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+ /*
+ * There is only ever one instance of a particular block
+ * device so we can compare pointers safely.
+ */
+ return lhs == rhs;
+}
+
struct dm_snap_pending_exception {
struct dm_snap_exception e;
@@ -476,11 +541,11 @@ static int init_hash_tables(struct dm_snapshot *s)
* Calculate based on the size of the original volume or
* the COW volume...
*/
- cow_dev_size = get_dev_size(s->cow->bdev);
+ cow_dev_size = get_dev_size(s->store->cow->bdev);
origin_dev_size = get_dev_size(s->origin->bdev);
max_buckets = calc_max_buckets();
- hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
+ hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
hash_size = min(hash_size, max_buckets);
hash_size = rounddown_pow_of_two(hash_size);
@@ -505,58 +570,6 @@ static int init_hash_tables(struct dm_snapshot *s)
}
/*
- * Round a number up to the nearest 'size' boundary. size must
- * be a power of 2.
- */
-static ulong round_up(ulong n, ulong size)
-{
- size--;
- return (n + size) & ~size;
-}
-
-static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
- char **error)
-{
- unsigned long chunk_size;
- char *value;
-
- chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
- if (*chunk_size_arg == '\0' || *value != '\0') {
- *error = "Invalid chunk size";
- return -EINVAL;
- }
-
- if (!chunk_size) {
- s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
- return 0;
- }
-
- /*
- * Chunk size must be multiple of page size. Silently
- * round up if it's not.
- */
- chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
- /* Check chunk_size is a power of 2 */
- if (!is_power_of_2(chunk_size)) {
- *error = "Chunk size is not a power of 2";
- return -EINVAL;
- }
-
- /* Validate the chunk size against the device block size */
- if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
- *error = "Chunk size is not a multiple of device blocksize";
- return -EINVAL;
- }
-
- s->chunk_size = chunk_size;
- s->chunk_mask = chunk_size - 1;
- s->chunk_shift = ffs(chunk_size) - 1;
-
- return 0;
-}
-
-/*
* Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
*/
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
@@ -564,91 +577,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct dm_snapshot *s;
int i;
int r = -EINVAL;
- char persistent;
char *origin_path;
- char *cow_path;
+ struct dm_exception_store *store;
+ unsigned args_used;
if (argc != 4) {
ti->error = "requires exactly 4 arguments";
r = -EINVAL;
- goto bad1;
+ goto bad_args;
}
origin_path = argv[0];
- cow_path = argv[1];
- persistent = toupper(*argv[2]);
+ argv++;
+ argc--;
- if (persistent != 'P' && persistent != 'N') {
- ti->error = "Persistent flag is not P or N";
+ r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
+ if (r) {
+ ti->error = "Couldn't create exception store";
r = -EINVAL;
- goto bad1;
+ goto bad_args;
}
+ argv += args_used;
+ argc -= args_used;
+
s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL) {
+ if (!s) {
ti->error = "Cannot allocate snapshot context private "
"structure";
r = -ENOMEM;
- goto bad1;
+ goto bad_snap;
}
r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
if (r) {
ti->error = "Cannot get origin device";
- goto bad2;
- }
-
- r = dm_get_device(ti, cow_path, 0, 0,
- FMODE_READ | FMODE_WRITE, &s->cow);
- if (r) {
- dm_put_device(ti, s->origin);
- ti->error = "Cannot get COW device";
- goto bad2;
+ goto bad_origin;
}
- r = set_chunk_size(s, argv[3], &ti->error);
- if (r)
- goto bad3;
-
- s->type = persistent;
-
+ s->store = store;
s->valid = 1;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
init_rwsem(&s->lock);
spin_lock_init(&s->pe_lock);
- s->ti = ti;
/* Allocate hash table for COW data */
if (init_hash_tables(s)) {
ti->error = "Unable to allocate hash table space";
r = -ENOMEM;
- goto bad3;
- }
-
- s->store.snap = s;
-
- if (persistent == 'P')
- r = dm_create_persistent(&s->store);
- else
- r = dm_create_transient(&s->store);
-
- if (r) {
- ti->error = "Couldn't create exception store";
- r = -EINVAL;
- goto bad4;
+ goto bad_hash_tables;
}
r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
if (r) {
ti->error = "Could not create kcopyd client";
- goto bad5;
+ goto bad_kcopyd;
}
s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
if (!s->pending_pool) {
ti->error = "Could not allocate mempool for pending exceptions";
- goto bad6;
+ goto bad_pending_pool;
}
s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
@@ -665,7 +655,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&s->tracked_chunk_lock);
/* Metadata must only be loaded into one table at once */
- r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
+ r = s->store->type->read_metadata(s->store, dm_add_exception,
+ (void *)s);
if (r < 0) {
ti->error = "Failed to read snapshot metadata";
goto bad_load_and_register;
@@ -686,34 +677,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->private = s;
- ti->split_io = s->chunk_size;
+ ti->split_io = s->store->chunk_size;
return 0;
- bad_load_and_register:
+bad_load_and_register:
mempool_destroy(s->tracked_chunk_pool);
- bad_tracked_chunk_pool:
+bad_tracked_chunk_pool:
mempool_destroy(s->pending_pool);
- bad6:
+bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
- bad5:
- s->store.destroy(&s->store);
-
- bad4:
+bad_kcopyd:
exit_exception_table(&s->pending, pending_cache);
exit_exception_table(&s->complete, exception_cache);
- bad3:
- dm_put_device(ti, s->cow);
+bad_hash_tables:
dm_put_device(ti, s->origin);
- bad2:
+bad_origin:
kfree(s);
- bad1:
+bad_snap:
+ dm_exception_store_destroy(store);
+
+bad_args:
return r;
}
@@ -724,8 +714,6 @@ static void __free_exceptions(struct dm_snapshot *s)
exit_exception_table(&s->pending, pending_cache);
exit_exception_table(&s->complete, exception_cache);
-
- s->store.destroy(&s->store);
}
static void snapshot_dtr(struct dm_target *ti)
@@ -761,7 +749,8 @@ static void snapshot_dtr(struct dm_target *ti)
mempool_destroy(s->pending_pool);
dm_put_device(ti, s->origin);
- dm_put_device(ti, s->cow);
+
+ dm_exception_store_destroy(s->store);
kfree(s);
}
@@ -820,12 +809,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
else if (err == -ENOMEM)
DMERR("Invalidating snapshot: Unable to allocate exception.");
- if (s->store.drop_snapshot)
- s->store.drop_snapshot(&s->store);
+ if (s->store->type->drop_snapshot)
+ s->store->type->drop_snapshot(s->store);
s->valid = 0;
- dm_table_event(s->ti->table);
+ dm_table_event(s->store->ti->table);
}
static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -943,8 +932,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
else
/* Update the metadata if we are persistent */
- s->store.commit_exception(&s->store, &pe->e, commit_callback,
- pe);
+ s->store->type->commit_exception(s->store, &pe->e,
+ commit_callback, pe);
}
/*
@@ -960,11 +949,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dev_size = get_dev_size(bdev);
src.bdev = bdev;
- src.sector = chunk_to_sector(s, pe->e.old_chunk);
- src.count = min(s->chunk_size, dev_size - src.sector);
+ src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
+ src.count = min(s->store->chunk_size, dev_size - src.sector);
- dest.bdev = s->cow->bdev;
- dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+ dest.bdev = s->store->cow->bdev;
+ dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
dest.count = src.count;
/* Hand over to kcopyd */
@@ -972,6 +961,17 @@ static void start_copy(struct dm_snap_pending_exception *pe)
&src, 1, &dest, 0, copy_callback, pe);
}
+static struct dm_snap_pending_exception *
+__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
+{
+ struct dm_snap_exception *e = lookup_exception(&s->pending, chunk);
+
+ if (!e)
+ return NULL;
+
+ return container_of(e, struct dm_snap_pending_exception, e);
+}
+
/*
* Looks to see if this snapshot already has a pending exception
* for this chunk, otherwise it allocates a new one and inserts
@@ -981,40 +981,15 @@ static void start_copy(struct dm_snap_pending_exception *pe)
* this.
*/
static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
+__find_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
{
- struct dm_snap_exception *e;
- struct dm_snap_pending_exception *pe;
- chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+ struct dm_snap_pending_exception *pe2;
- /*
- * Is there a pending exception for this already ?
- */
- e = lookup_exception(&s->pending, chunk);
- if (e) {
- /* cast the exception to a pending exception */
- pe = container_of(e, struct dm_snap_pending_exception, e);
- goto out;
- }
-
- /*
- * Create a new pending exception, we don't want
- * to hold the lock while we do this.
- */
- up_write(&s->lock);
- pe = alloc_pending_exception(s);
- down_write(&s->lock);
-
- if (!s->valid) {
- free_pending_exception(pe);
- return NULL;
- }
-
- e = lookup_exception(&s->pending, chunk);
- if (e) {
+ pe2 = __lookup_pending_exception(s, chunk);
+ if (pe2) {
free_pending_exception(pe);
- pe = container_of(e, struct dm_snap_pending_exception, e);
- goto out;
+ return pe2;
}
pe->e.old_chunk = chunk;
@@ -1024,7 +999,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
atomic_set(&pe->ref_count, 0);
pe->started = 0;
- if (s->store.prepare_exception(&s->store, &pe->e)) {
+ if (s->store->type->prepare_exception(s->store, &pe->e)) {
free_pending_exception(pe);
return NULL;
}
@@ -1032,17 +1007,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
get_pending_exception(pe);
insert_exception(&s->pending, &pe->e);
- out:
return pe;
}
static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
struct bio *bio, chunk_t chunk)
{
- bio->bi_bdev = s->cow->bdev;
- bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
- (chunk - e->old_chunk)) +
- (bio->bi_sector & s->chunk_mask);
+ bio->bi_bdev = s->store->cow->bdev;
+ bio->bi_sector = chunk_to_sector(s->store,
+ dm_chunk_number(e->new_chunk) +
+ (chunk - e->old_chunk)) +
+ (bio->bi_sector &
+ s->store->chunk_mask);
}
static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -1054,7 +1030,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
- chunk = sector_to_chunk(s, bio->bi_sector);
+ chunk = sector_to_chunk(s->store, bio->bi_sector);
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
@@ -1083,11 +1059,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
* writeable.
*/
if (bio_rw(bio) == WRITE) {
- pe = __find_pending_exception(s, bio);
+ pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- __invalidate_snapshot(s, -ENOMEM);
- r = -EIO;
- goto out_unlock;
+ up_write(&s->lock);
+ pe = alloc_pending_exception(s);
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ free_pending_exception(pe);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ e = lookup_exception(&s->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ remap_exception(s, e, bio, chunk);
+ goto out_unlock;
+ }
+
+ pe = __find_pending_exception(s, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(s, -ENOMEM);
+ r = -EIO;
+ goto out_unlock;
+ }
}
remap_exception(s, &pe->e, bio, chunk);
@@ -1137,24 +1133,25 @@ static void snapshot_resume(struct dm_target *ti)
static int snapshot_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
+ unsigned sz = 0;
struct dm_snapshot *snap = ti->private;
switch (type) {
case STATUSTYPE_INFO:
if (!snap->valid)
- snprintf(result, maxlen, "Invalid");
+ DMEMIT("Invalid");
else {
- if (snap->store.fraction_full) {
+ if (snap->store->type->fraction_full) {
sector_t numerator, denominator;
- snap->store.fraction_full(&snap->store,
- &numerator,
- &denominator);
- snprintf(result, maxlen, "%llu/%llu",
- (unsigned long long)numerator,
- (unsigned long long)denominator);
+ snap->store->type->fraction_full(snap->store,
+ &numerator,
+ &denominator);
+ DMEMIT("%llu/%llu",
+ (unsigned long long)numerator,
+ (unsigned long long)denominator);
}
else
- snprintf(result, maxlen, "Unknown");
+ DMEMIT("Unknown");
}
break;
@@ -1164,10 +1161,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
* to make private copies if the output is to
* make sense.
*/
- snprintf(result, maxlen, "%s %s %c %llu",
- snap->origin->name, snap->cow->name,
- snap->type,
- (unsigned long long)snap->chunk_size);
+ DMEMIT("%s", snap->origin->name);
+ snap->store->type->status(snap->store, type, result + sz,
+ maxlen - sz);
break;
}
@@ -1196,14 +1192,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
goto next_snapshot;
/* Nothing to do if writing beyond end of snapshot */
- if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
+ if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
goto next_snapshot;
/*
* Remember, different snapshots can have
* different chunk sizes.
*/
- chunk = sector_to_chunk(snap, bio->bi_sector);
+ chunk = sector_to_chunk(snap->store, bio->bi_sector);
/*
* Check exception table to see if block
@@ -1217,10 +1213,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
if (e)
goto next_snapshot;
- pe = __find_pending_exception(snap, bio);
+ pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- __invalidate_snapshot(snap, -ENOMEM);
- goto next_snapshot;
+ up_write(&snap->lock);
+ pe = alloc_pending_exception(snap);
+ down_write(&snap->lock);
+
+ if (!snap->valid) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ e = lookup_exception(&snap->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ pe = __find_pending_exception(snap, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(snap, -ENOMEM);
+ goto next_snapshot;
+ }
}
if (!primary_pe) {
@@ -1360,7 +1374,8 @@ static void origin_resume(struct dm_target *ti)
o = __lookup_origin(dev->bdev);
if (o)
list_for_each_entry (snap, &o->snapshots, list)
- chunk_size = min_not_zero(chunk_size, snap->chunk_size);
+ chunk_size = min_not_zero(chunk_size,
+ snap->store->chunk_size);
up_read(&_origins_lock);
ti->split_io = chunk_size;
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
deleted file mode 100644
index d9e62b43cf85..000000000000
--- a/drivers/md/dm-snap.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_SNAPSHOT_H
-#define DM_SNAPSHOT_H
-
-#include <linux/device-mapper.h>
-#include "dm-exception-store.h"
-#include "dm-bio-list.h"
-#include <linux/blkdev.h>
-#include <linux/workqueue.h>
-
-struct exception_table {
- uint32_t hash_mask;
- unsigned hash_shift;
- struct list_head *table;
-};
-
-#define DM_TRACKED_CHUNK_HASH_SIZE 16
-#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
- (DM_TRACKED_CHUNK_HASH_SIZE - 1))
-
-struct dm_snapshot {
- struct rw_semaphore lock;
- struct dm_target *ti;
-
- struct dm_dev *origin;
- struct dm_dev *cow;
-
- /* List of snapshots per Origin */
- struct list_head list;
-
- /* Size of data blocks saved - must be a power of 2 */
- chunk_t chunk_size;
- chunk_t chunk_mask;
- chunk_t chunk_shift;
-
- /* You can't use a snapshot if this is 0 (e.g. if full) */
- int valid;
-
- /* Origin writes don't trigger exceptions until this is set */
- int active;
-
- /* Used for display of table */
- char type;
-
- mempool_t *pending_pool;
-
- atomic_t pending_exceptions_count;
-
- struct exception_table pending;
- struct exception_table complete;
-
- /*
- * pe_lock protects all pending_exception operations and access
- * as well as the snapshot_bios list.
- */
- spinlock_t pe_lock;
-
- /* The on disk metadata handler */
- struct dm_exception_store store;
-
- struct dm_kcopyd_client *kcopyd_client;
-
- /* Queue of snapshot writes for ksnapd to flush */
- struct bio_list queued_bios;
- struct work_struct queued_bios_work;
-
- /* Chunks with outstanding reads */
- mempool_t *tracked_chunk_pool;
- spinlock_t tracked_chunk_lock;
- struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
-};
-
-/*
- * Return the number of sectors in the device.
- */
-static inline sector_t get_dev_size(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> SECTOR_SHIFT;
-}
-
-static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
-{
- return (sector & ~s->chunk_mask) >> s->chunk_shift;
-}
-
-static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
-{
- return chunk << s->chunk_shift;
-}
-
-static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
-{
- /*
- * There is only ever one instance of a particular block
- * device so we can compare pointers safely.
- */
- return lhs == rhs;
-}
-
-#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2fd66c30f7f8..429b50b975d5 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -52,8 +52,6 @@ struct dm_table {
sector_t *highs;
struct dm_target *targets;
- unsigned barriers_supported:1;
-
/*
* Indicates the rw permissions for the new logical
* device. This should be a combination of FMODE_READ
@@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
INIT_LIST_HEAD(&t->devices);
atomic_set(&t->holders, 0);
- t->barriers_supported = 1;
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -399,28 +396,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start,
}
/*
- * This upgrades the mode on an already open dm_dev. Being
+ * This upgrades the mode on an already open dm_dev, being
* careful to leave things as they were if we fail to reopen the
- * device.
+ * device and not to touch the existing bdev field in case
+ * it is accessed concurrently inside dm_table_any_congested().
*/
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
struct mapped_device *md)
{
int r;
- struct dm_dev_internal dd_copy;
- dev_t dev = dd->dm_dev.bdev->bd_dev;
+ struct dm_dev_internal dd_new, dd_old;
+
+ dd_new = dd_old = *dd;
+
+ dd_new.dm_dev.mode |= new_mode;
+ dd_new.dm_dev.bdev = NULL;
- dd_copy = *dd;
+ r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+ if (r)
+ return r;
dd->dm_dev.mode |= new_mode;
- dd->dm_dev.bdev = NULL;
- r = open_dev(dd, dev, md);
- if (!r)
- close_dev(&dd_copy, md);
- else
- *dd = dd_copy;
+ close_dev(&dd_old, md);
- return r;
+ return 0;
}
/*
@@ -749,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
/* FIXME: the plan is to combine high here and then have
* the merge fn apply the target level restrictions. */
combine_restrictions_low(&t->limits, &tgt->limits);
-
- if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
- t->barriers_supported = 0;
-
return 0;
bad:
@@ -797,12 +792,6 @@ int dm_table_complete(struct dm_table *t)
check_for_valid_limits(&t->limits);
- /*
- * We only support barriers if there is exactly one underlying device.
- */
- if (!list_is_singular(&t->devices))
- t->barriers_supported = 0;
-
/* how many indexes will the btree have ? */
leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -877,6 +866,45 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
return &t->targets[(KEYS_PER_NODE * n) + k];
}
+/*
+ * Set the integrity profile for this device if all devices used have
+ * matching profiles.
+ */
+static void dm_table_set_integrity(struct dm_table *t)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *prev = NULL, *dd = NULL;
+
+ if (!blk_get_integrity(dm_disk(t->md)))
+ return;
+
+ list_for_each_entry(dd, devices, list) {
+ if (prev &&
+ blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
+ dd->dm_dev.bdev->bd_disk) < 0) {
+ DMWARN("%s: integrity not set: %s and %s mismatch",
+ dm_device_name(t->md),
+ prev->dm_dev.bdev->bd_disk->disk_name,
+ dd->dm_dev.bdev->bd_disk->disk_name);
+ goto no_integrity;
+ }
+ prev = dd;
+ }
+
+ if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
+ goto no_integrity;
+
+ blk_integrity_register(dm_disk(t->md),
+ bdev_get_integrity(prev->dm_dev.bdev));
+
+ return;
+
+no_integrity:
+ blk_integrity_register(dm_disk(t->md), NULL);
+
+ return;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
{
/*
@@ -897,6 +925,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
else
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
+ dm_table_set_integrity(t);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1017,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
return t->md;
}
-int dm_table_barrier_ok(struct dm_table *t)
-{
- return t->barriers_supported;
-}
-EXPORT_SYMBOL(dm_table_barrier_ok);
-
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 7decf10006e4..04feccf2a997 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -14,45 +14,34 @@
#define DM_MSG_PREFIX "target"
-struct tt_internal {
- struct target_type tt;
-
- struct list_head list;
- long use;
-};
-
static LIST_HEAD(_targets);
static DECLARE_RWSEM(_lock);
#define DM_MOD_NAME_SIZE 32
-static inline struct tt_internal *__find_target_type(const char *name)
+static inline struct target_type *__find_target_type(const char *name)
{
- struct tt_internal *ti;
+ struct target_type *tt;
- list_for_each_entry (ti, &_targets, list)
- if (!strcmp(name, ti->tt.name))
- return ti;
+ list_for_each_entry(tt, &_targets, list)
+ if (!strcmp(name, tt->name))
+ return tt;
return NULL;
}
-static struct tt_internal *get_target_type(const char *name)
+static struct target_type *get_target_type(const char *name)
{
- struct tt_internal *ti;
+ struct target_type *tt;
down_read(&_lock);
- ti = __find_target_type(name);
- if (ti) {
- if ((ti->use == 0) && !try_module_get(ti->tt.module))
- ti = NULL;
- else
- ti->use++;
- }
+ tt = __find_target_type(name);
+ if (tt && !try_module_get(tt->module))
+ tt = NULL;
up_read(&_lock);
- return ti;
+ return tt;
}
static void load_module(const char *name)
@@ -62,92 +51,59 @@ static void load_module(const char *name)
struct target_type *dm_get_target_type(const char *name)
{
- struct tt_internal *ti = get_target_type(name);
+ struct target_type *tt = get_target_type(name);
- if (!ti) {
+ if (!tt) {
load_module(name);
- ti = get_target_type(name);
+ tt = get_target_type(name);
}
- return ti ? &ti->tt : NULL;
+ return tt;
}
-void dm_put_target_type(struct target_type *t)
+void dm_put_target_type(struct target_type *tt)
{
- struct tt_internal *ti = (struct tt_internal *) t;
-
down_read(&_lock);
- if (--ti->use == 0)
- module_put(ti->tt.module);
-
- BUG_ON(ti->use < 0);
+ module_put(tt->module);
up_read(&_lock);
-
- return;
-}
-
-static struct tt_internal *alloc_target(struct target_type *t)
-{
- struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL);
-
- if (ti)
- ti->tt = *t;
-
- return ti;
}
-
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param)
{
- struct tt_internal *ti;
+ struct target_type *tt;
down_read(&_lock);
- list_for_each_entry (ti, &_targets, list)
- iter_func(&ti->tt, param);
+ list_for_each_entry(tt, &_targets, list)
+ iter_func(tt, param);
up_read(&_lock);
return 0;
}
-int dm_register_target(struct target_type *t)
+int dm_register_target(struct target_type *tt)
{
int rv = 0;
- struct tt_internal *ti = alloc_target(t);
-
- if (!ti)
- return -ENOMEM;
down_write(&_lock);
- if (__find_target_type(t->name))
+ if (__find_target_type(tt->name))
rv = -EEXIST;
else
- list_add(&ti->list, &_targets);
+ list_add(&tt->list, &_targets);
up_write(&_lock);
- if (rv)
- kfree(ti);
return rv;
}
-void dm_unregister_target(struct target_type *t)
+void dm_unregister_target(struct target_type *tt)
{
- struct tt_internal *ti;
-
down_write(&_lock);
- if (!(ti = __find_target_type(t->name))) {
- DMCRIT("Unregistering unrecognised target: %s", t->name);
- BUG();
- }
-
- if (ti->use) {
- DMCRIT("Attempt to unregister target still in use: %s",
- t->name);
+ if (!__find_target_type(tt->name)) {
+ DMCRIT("Unregistering unrecognised target: %s", tt->name);
BUG();
}
- list_del(&ti->list);
- kfree(ti);
+ list_del(&tt->list);
up_write(&_lock);
}
@@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t)
* io-err: always fails an io, useful for bringing
* up LVs that have holes in them.
*/
-static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
{
return 0;
}
-static void io_err_dtr(struct dm_target *ti)
+static void io_err_dtr(struct dm_target *tt)
{
/* empty */
}
-static int io_err_map(struct dm_target *ti, struct bio *bio,
+static int io_err_map(struct dm_target *tt, struct bio *bio,
union map_info *map_context)
{
return -EIO;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8d40f27cce89..424f7b048c30 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -6,7 +6,6 @@
*/
#include "dm.h"
-#include "dm-bio-list.h"
#include "dm-uevent.h"
#include <linux/init.h>
@@ -89,29 +88,20 @@ union map_info *dm_get_mapinfo(struct bio *bio)
/*
* Bits for the md->flags field.
*/
-#define DMF_BLOCK_IO 0
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_QUEUE_IO_TO_THREAD 6
/*
* Work processed by per-device workqueue.
*/
-struct dm_wq_req {
- enum {
- DM_WQ_FLUSH_DEFERRED,
- } type;
- struct work_struct work;
- struct mapped_device *md;
- void *context;
-};
-
struct mapped_device {
struct rw_semaphore io_lock;
struct mutex suspend_lock;
- spinlock_t pushback_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
@@ -129,8 +119,14 @@ struct mapped_device {
*/
atomic_t pending;
wait_queue_head_t wait;
+ struct work_struct work;
struct bio_list deferred;
- struct bio_list pushback;
+ spinlock_t deferred_lock;
+
+ /*
+ * An error from the barrier request currently being processed.
+ */
+ int barrier_error;
/*
* Processing queue (flush/barriers)
@@ -433,6 +429,10 @@ static void end_io_acct(struct dm_io *io)
part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
part_stat_unlock();
+ /*
+ * After this is decremented the bio must not be touched if it is
+ * a barrier.
+ */
dm_disk(md)->part0.in_flight = pending =
atomic_dec_return(&md->pending);
@@ -444,19 +444,18 @@ static void end_io_acct(struct dm_io *io)
/*
* Add the bio to the list of deferred io.
*/
-static int queue_io(struct mapped_device *md, struct bio *bio)
+static void queue_io(struct mapped_device *md, struct bio *bio)
{
down_write(&md->io_lock);
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
- up_write(&md->io_lock);
- return 1;
- }
-
+ spin_lock_irq(&md->deferred_lock);
bio_list_add(&md->deferred, bio);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
+ queue_work(md->wq, &md->work);
up_write(&md->io_lock);
- return 0; /* deferred successfully */
}
/*
@@ -537,30 +536,38 @@ static void dec_pending(struct dm_io *io, int error)
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
- * This must be handled before the sleeper on
- * suspend queue merges the pushback list.
*/
- spin_lock_irqsave(&md->pushback_lock, flags);
+ spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md))
- bio_list_add(&md->pushback, io->bio);
+ bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
- spin_unlock_irqrestore(&md->pushback_lock, flags);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
}
- end_io_acct(io);
-
io_error = io->error;
bio = io->bio;
- free_io(md, io);
+ if (bio_barrier(bio)) {
+ /*
+ * There can be just one barrier request so we use
+ * a per-device variable for error reporting.
+ * Note that you can't touch the bio after end_io_acct
+ */
+ md->barrier_error = io_error;
+ end_io_acct(io);
+ } else {
+ end_io_acct(io);
- if (io_error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(md->queue, bio);
+ if (io_error != DM_ENDIO_REQUEUE) {
+ trace_block_bio_complete(md->queue, bio);
- bio_endio(bio, io_error);
+ bio_endio(bio, io_error);
+ }
}
+
+ free_io(md, io);
}
}
@@ -702,13 +709,19 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
+ clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
clone->bi_flags |= 1 << BIO_CLONED;
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, offset), len);
+ }
+
return clone;
}
@@ -723,6 +736,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
+ clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
@@ -730,6 +744,14 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, 0), len);
+ }
+
return clone;
}
@@ -834,21 +856,22 @@ static int __clone_and_map(struct clone_info *ci)
}
/*
- * Split the bio into several clones.
+ * Split the bio into several clones and submit it to targets.
*/
-static int __split_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
struct clone_info ci;
int error = 0;
ci.map = dm_get_table(md);
- if (unlikely(!ci.map))
- return -EIO;
- if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
- dm_table_put(ci.map);
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
+ if (unlikely(!ci.map)) {
+ if (!bio_barrier(bio))
+ bio_io_error(bio);
+ else
+ md->barrier_error = -EIO;
+ return;
}
+
ci.md = md;
ci.bio = bio;
ci.io = alloc_io(md);
@@ -867,8 +890,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
/* drop the extra reference count */
dec_pending(ci.io, error);
dm_table_put(ci.map);
-
- return 0;
}
/*-----------------------------------------------------------------
* CRUD END
@@ -927,7 +948,6 @@ out:
*/
static int dm_request(struct request_queue *q, struct bio *bio)
{
- int r = -EIO;
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int cpu;
@@ -940,32 +960,26 @@ static int dm_request(struct request_queue *q, struct bio *bio)
part_stat_unlock();
/*
- * If we're suspended we have to queue
- * this io for later.
+ * If we're suspended or the thread is processing barriers
+ * we have to queue this io for later.
*/
- while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+ unlikely(bio_barrier(bio))) {
up_read(&md->io_lock);
- if (bio_rw(bio) != READA)
- r = queue_io(md, bio);
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+ bio_rw(bio) == READA) {
+ bio_io_error(bio);
+ return 0;
+ }
- if (r <= 0)
- goto out_req;
+ queue_io(md, bio);
- /*
- * We're in a while loop, because someone could suspend
- * before we get to the following read lock.
- */
- down_read(&md->io_lock);
+ return 0;
}
- r = __split_bio(md, bio);
+ __split_and_process_bio(md, bio);
up_read(&md->io_lock);
-
-out_req:
- if (r < 0)
- bio_io_error(bio);
-
return 0;
}
@@ -986,7 +1000,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
struct mapped_device *md = congested_data;
struct dm_table *map;
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
map = dm_get_table(md);
if (map) {
r = dm_table_any_congested(map, bdi_bits);
@@ -1074,6 +1088,8 @@ out:
static struct block_device_operations dm_blk_dops;
+static void dm_wq_work(struct work_struct *work);
+
/*
* Allocate and initialise a blank device with a given minor.
*/
@@ -1101,7 +1117,7 @@ static struct mapped_device *alloc_dev(int minor)
init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
- spin_lock_init(&md->pushback_lock);
+ spin_lock_init(&md->deferred_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -1118,6 +1134,7 @@ static struct mapped_device *alloc_dev(int minor)
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
@@ -1140,6 +1157,7 @@ static struct mapped_device *alloc_dev(int minor)
atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait);
+ INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
@@ -1197,6 +1215,7 @@ static void free_dev(struct mapped_device *md)
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
bioset_free(md->bs);
+ blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
free_minor(minor);
@@ -1379,18 +1398,24 @@ void dm_put(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_put);
-static int dm_wait_for_completion(struct mapped_device *md)
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
int r = 0;
+ DECLARE_WAITQUEUE(wait, current);
+
+ dm_unplug_all(md->queue);
+
+ add_wait_queue(&md->wait, &wait);
while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(interruptible);
smp_mb();
if (!atomic_read(&md->pending))
break;
- if (signal_pending(current)) {
+ if (interruptible == TASK_INTERRUPTIBLE &&
+ signal_pending(current)) {
r = -EINTR;
break;
}
@@ -1399,68 +1424,80 @@ static int dm_wait_for_completion(struct mapped_device *md)
}
set_current_state(TASK_RUNNING);
+ remove_wait_queue(&md->wait, &wait);
+
return r;
}
-/*
- * Process the deferred bios
- */
-static void __flush_deferred_io(struct mapped_device *md)
+static int dm_flush(struct mapped_device *md)
{
- struct bio *c;
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+ return 0;
+}
+
+static void process_barrier(struct mapped_device *md, struct bio *bio)
+{
+ int error = dm_flush(md);
- while ((c = bio_list_pop(&md->deferred))) {
- if (__split_bio(md, c))
- bio_io_error(c);
+ if (unlikely(error)) {
+ bio_endio(bio, error);
+ return;
+ }
+ if (bio_empty_barrier(bio)) {
+ bio_endio(bio, 0);
+ return;
}
- clear_bit(DMF_BLOCK_IO, &md->flags);
-}
+ __split_and_process_bio(md, bio);
-static void __merge_pushback_list(struct mapped_device *md)
-{
- unsigned long flags;
+ error = dm_flush(md);
- spin_lock_irqsave(&md->pushback_lock, flags);
- clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- bio_list_merge_head(&md->deferred, &md->pushback);
- bio_list_init(&md->pushback);
- spin_unlock_irqrestore(&md->pushback_lock, flags);
+ if (!error && md->barrier_error)
+ error = md->barrier_error;
+
+ if (md->barrier_error != DM_ENDIO_REQUEUE)
+ bio_endio(bio, error);
}
+/*
+ * Process the deferred bios
+ */
static void dm_wq_work(struct work_struct *work)
{
- struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
- struct mapped_device *md = req->md;
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ work);
+ struct bio *c;
down_write(&md->io_lock);
- switch (req->type) {
- case DM_WQ_FLUSH_DEFERRED:
- __flush_deferred_io(md);
- break;
- default:
- DMERR("dm_wq_work: unrecognised work type %d", req->type);
- BUG();
+
+ while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ spin_lock_irq(&md->deferred_lock);
+ c = bio_list_pop(&md->deferred);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!c) {
+ clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ break;
+ }
+
+ up_write(&md->io_lock);
+
+ if (bio_barrier(c))
+ process_barrier(md, c);
+ else
+ __split_and_process_bio(md, c);
+
+ down_write(&md->io_lock);
}
- up_write(&md->io_lock);
-}
-static void dm_wq_queue(struct mapped_device *md, int type, void *context,
- struct dm_wq_req *req)
-{
- req->type = type;
- req->md = md;
- req->context = context;
- INIT_WORK(&req->work, dm_wq_work);
- queue_work(md->wq, &req->work);
+ up_write(&md->io_lock);
}
-static void dm_queue_flush(struct mapped_device *md, int type, void *context)
+static void dm_queue_flush(struct mapped_device *md)
{
- struct dm_wq_req req;
-
- dm_wq_queue(md, type, context, &req);
- flush_workqueue(md->wq);
+ clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ smp_mb__after_clear_bit();
+ queue_work(md->wq, &md->work);
}
/*
@@ -1534,7 +1571,6 @@ static void unlock_fs(struct mapped_device *md)
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
- DECLARE_WAITQUEUE(wait, current);
int r = 0;
int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
@@ -1579,38 +1615,54 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
}
/*
- * First we set the BLOCK_IO flag so no more ios will be mapped.
+ * Here we must make sure that no processes are submitting requests
+ * to target drivers i.e. no one may be executing
+ * __split_and_process_bio. This is called from dm_request and
+ * dm_wq_work.
+ *
+ * To get all processes out of __split_and_process_bio in dm_request,
+ * we take the write lock. To prevent any process from reentering
+ * __split_and_process_bio from dm_request, we set
+ * DMF_QUEUE_IO_TO_THREAD.
+ *
+ * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
+ * and call flush_workqueue(md->wq). flush_workqueue will wait until
+ * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
+ * further calls to __split_and_process_bio from dm_wq_work.
*/
down_write(&md->io_lock);
- set_bit(DMF_BLOCK_IO, &md->flags);
-
- add_wait_queue(&md->wait, &wait);
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
- /* unplug */
- if (map)
- dm_table_unplug_all(map);
+ flush_workqueue(md->wq);
/*
- * Wait for the already-mapped ios to complete.
+ * At this point no more requests are entering target request routines.
+ * We call dm_wait_for_completion to wait for all existing requests
+ * to finish.
*/
- r = dm_wait_for_completion(md);
+ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
down_write(&md->io_lock);
- remove_wait_queue(&md->wait, &wait);
-
if (noflush)
- __merge_pushback_list(md);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
up_write(&md->io_lock);
/* were we interrupted ? */
if (r < 0) {
- dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+ dm_queue_flush(md);
unlock_fs(md);
goto out; /* pushback list is already flushed, so skip flush */
}
+ /*
+ * If dm_wait_for_completion returned 0, the device is completely
+ * quiescent now. There is no request-processing activity. All new
+ * requests are being added to md->deferred list.
+ */
+
dm_table_postsuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags);
@@ -1645,7 +1697,7 @@ int dm_resume(struct mapped_device *md)
if (r)
goto out;
- dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+ dm_queue_flush(md);
unlock_fs(md);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 20194e000c5a..a31506d93e91 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
* To check the return value from dm_table_find_target().
*/
#define dm_target_is_valid(t) ((t)->table)
-int dm_table_barrier_ok(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
@@ -60,7 +59,7 @@ int dm_table_barrier_ok(struct dm_table *t);
int dm_target_init(void);
void dm_target_exit(void);
struct target_type *dm_get_target_type(const char *name);
-void dm_put_target_type(struct target_type *t);
+void dm_put_target_type(struct target_type *tt);
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 86d9adf90e79..8695809b24b0 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -62,7 +62,10 @@
#define ModeShift 5
#define MaxFault 50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
return 0;
}
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ WARN_ONCE(raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ if (sectors == 0)
+ return mddev->dev_sectors;
+
+ return sectors;
+}
+
static int run(mddev_t *mddev)
{
mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
list_for_each_entry(rdev, &mddev->disks, same_set)
conf->rdev = rdev;
- mddev->array_sectors = mddev->size * 2;
+ md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
mddev->private = conf;
reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
.stop = stop,
.status = status,
.reconfig = reconfig,
+ .size = faulty_size,
};
static int __init raid_init(void)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09658b218474..7a36e38393a1 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -16,7 +16,11 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
/*
* find which device holds a particular offset
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
return ret;
}
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ linear_conf_t *conf = mddev_to_conf(mddev);
+
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ return conf->array_sectors;
+}
+
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
mddev->queue->max_sectors > (PAGE_SIZE>>9))
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- disk->num_sectors = rdev->size * 2;
- conf->array_sectors += rdev->size * 2;
+ disk->num_sectors = rdev->sectors;
+ conf->array_sectors += rdev->sectors;
cnt++;
}
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
if (!conf)
return 1;
mddev->private = conf;
- mddev->array_sectors = conf->array_sectors;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
newconf->prev = mddev_to_conf(mddev);
mddev->private = newconf;
mddev->raid_disks++;
- mddev->array_sectors = newconf->array_sectors;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
return 0;
}
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
.stop = linear_stop,
.status = linear_status,
.hot_add_disk = linear_add,
+ .size = linear_size,
};
static int __init linear_init (void)
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
new file mode 100644
index 000000000000..bf8179587f95
--- /dev/null
+++ b/drivers/md/linear.h
@@ -0,0 +1,29 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+ mdk_rdev_t *rdev;
+ sector_t num_sectors;
+ sector_t start_sector;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_private_data
+{
+ struct linear_private_data *prev; /* earlier version */
+ dev_info_t **hash_table;
+ sector_t spacing;
+ sector_t array_sectors;
+ int sector_shift; /* shift before dividing
+ * by spacing
+ */
+ dev_info_t disks[0];
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90e..fccc8343a250 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,9 +33,9 @@
*/
#include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
#include <linux/sysctl.h>
+#include <linux/seq_file.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
@@ -45,11 +45,10 @@
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
)
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request. By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
{
- bio_io_error(bio);
- return 0;
+ mddev_t *mddev = q->queuedata;
+ int rv;
+ if (mddev == NULL || mddev->pers == NULL) {
+ bio_io_error(bio);
+ return 0;
+ }
+ rcu_read_lock();
+ if (mddev->suspended) {
+ DEFINE_WAIT(__wait);
+ for (;;) {
+ prepare_to_wait(&mddev->sb_wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!mddev->suspended)
+ break;
+ rcu_read_unlock();
+ schedule();
+ rcu_read_lock();
+ }
+ finish_wait(&mddev->sb_wait, &__wait);
+ }
+ atomic_inc(&mddev->active_io);
+ rcu_read_unlock();
+ rv = mddev->pers->make_request(q, bio);
+ if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+ wake_up(&mddev->sb_wait);
+
+ return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+ BUG_ON(mddev->suspended);
+ mddev->suspended = 1;
+ synchronize_rcu();
+ wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+ mddev->pers->quiesce(mddev, 1);
+ md_unregister_thread(mddev->thread);
+ mddev->thread = NULL;
+ /* we now know that no code is executing in the personality module,
+ * except possibly the tail end of a ->bi_end_io function, but that
+ * is certain to complete before the module has a chance to get
+ * unloaded
+ */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+ mddev->suspended = 0;
+ wake_up(&mddev->sb_wait);
+ mddev->pers->quiesce(mddev, 0);
}
+
static inline mddev_t *mddev_get(mddev_t *mddev)
{
atomic_inc(&mddev->active);
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1);
atomic_set(&new->openers, 0);
+ atomic_set(&new->active_io, 0);
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+ return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
static inline int mddev_trylock(mddev_t * mddev)
{
return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
rdev->sb_loaded = 0;
rdev->sb_page = NULL;
rdev->sb_start = 0;
- rdev->size = 0;
+ rdev->sectors = 0;
}
}
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
else
ret = 0;
}
- rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+ rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
- if (rdev->size < sb->size && sb->level > 1)
+ if (rdev->sectors < sb->size * 2 && sb->level > 1)
/* "this cannot possibly happen" ... */
ret = -EINVAL;
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->clevel[0] = 0;
mddev->layout = sb->layout;
mddev->raid_disks = sb->raid_disks;
- mddev->size = sb->size;
+ mddev->dev_sectors = sb->size * 2;
mddev->events = ev1;
mddev->bitmap_offset = 0;
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->ctime = mddev->ctime;
sb->level = mddev->level;
- sb->size = mddev->size;
+ sb->size = mddev->dev_sectors / 2;
sb->raid_disks = mddev->raid_disks;
sb->md_minor = mddev->md_minor;
sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
- if (num_sectors && num_sectors < rdev->mddev->size * 2)
+ if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_offset)
return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
ret = 0;
}
if (minor_version)
- rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+ le64_to_cpu(sb->data_offset);
else
- rdev->size = rdev->sb_start / 2;
- if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ rdev->sectors = rdev->sb_start;
+ if (rdev->sectors < le64_to_cpu(sb->data_size))
return -EINVAL;
- rdev->size = le64_to_cpu(sb->data_size)/2;
+ rdev->sectors = le64_to_cpu(sb->data_size);
if (le32_to_cpu(sb->chunksize))
- rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
- if (le64_to_cpu(sb->size) > rdev->size*2)
+ if (le64_to_cpu(sb->size) > rdev->sectors)
return -EINVAL;
return ret;
}
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->clevel[0] = 0;
mddev->layout = le32_to_cpu(sb->layout);
mddev->raid_disks = le32_to_cpu(sb->raid_disks);
- mddev->size = le64_to_cpu(sb->size)/2;
+ mddev->dev_sectors = le64_to_cpu(sb->size);
mddev->events = ev1;
mddev->bitmap_offset = 0;
mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
sb->raid_disks = cpu_to_le32(mddev->raid_disks);
- sb->size = cpu_to_le64(mddev->size<<1);
+ sb->size = cpu_to_le64(mddev->dev_sectors);
if (mddev->bitmap && mddev->bitmap_file == NULL) {
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
}
if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset > 0) {
- sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
- sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+ !test_bit(In_sync, &rdev->flags)) {
+ if (mddev->curr_resync_completed > rdev->recovery_offset)
+ rdev->recovery_offset = mddev->curr_resync_completed;
+ if (rdev->recovery_offset > 0) {
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+ sb->recovery_offset =
+ cpu_to_le64(rdev->recovery_offset);
+ }
}
if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
struct mdp_superblock_1 *sb;
sector_t max_sectors;
- if (num_sectors && num_sectors < rdev->mddev->size * 2)
+ if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
return 0; /* component must fit device */
if (rdev->sb_start < rdev->data_offset) {
/* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
sector_t sb_start;
sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
- max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+ max_sectors = rdev->sectors + sb_start - rdev->sb_start;
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
static LIST_HEAD(pending_raid_disks);
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+ struct mdk_personality *pers = mddev->pers;
+ struct gendisk *disk = mddev->gendisk;
+ struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+ struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+ /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+ if (pers && pers->level >= 4 && pers->level <= 6)
+ return;
+
+ /* If rdev is integrity capable, register profile for mddev */
+ if (!bi_mddev && bi_rdev) {
+ if (blk_integrity_register(disk, bi_rdev))
+ printk(KERN_ERR "%s: %s Could not register integrity!\n",
+ __func__, disk->disk_name);
+ else
+ printk(KERN_NOTICE "Enabling data integrity on %s\n",
+ disk->disk_name);
+ return;
+ }
+
+ /* Check that mddev and rdev have matching profiles */
+ if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+ printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+ disk->disk_name, rdev->bdev->bd_disk->disk_name);
+ printk(KERN_NOTICE "Disabling data integrity on %s\n",
+ disk->disk_name);
+ blk_integrity_unregister(disk);
+ }
+}
+
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
char b[BDEVNAME_SIZE];
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
if (find_rdev(mddev, rdev->bdev->bd_dev))
return -EEXIST;
- /* make sure rdev->size exceeds mddev->size */
- if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+ /* make sure rdev->sectors exceeds mddev->dev_sectors */
+ if (rdev->sectors && (mddev->dev_sectors == 0 ||
+ rdev->sectors < mddev->dev_sectors)) {
if (mddev->pers) {
/* Cannot change size, so fail
* If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
if (mddev->level > 0)
return -ENOSPC;
} else
- mddev->size = rdev->size;
+ mddev->dev_sectors = rdev->sectors;
}
/* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
+
+ md_integrity_check(rdev, mddev);
return 0;
fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
static void print_rdev(mdk_rdev_t *rdev, int major_version)
{
char b[BDEVNAME_SIZE];
- printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
- bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+ printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+ bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
rdev->desc_nr);
if (rdev->sb_loaded) {
@@ -1915,6 +2017,8 @@ repeat:
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -1984,6 +2088,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
* -writemostly - clears write_mostly
* blocked - sets the Blocked flag
* -blocked - clears the Blocked flag
+ * insync - sets Insync providing device isn't active
*/
int err = -EINVAL;
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2016,6 +2121,9 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
md_wakeup_thread(rdev->mddev->thread);
err = 0;
+ } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
+ set_bit(In_sync, &rdev->flags);
+ err = 0;
}
if (!err && rdev->sysfs_state)
sysfs_notify_dirent(rdev->sysfs_state);
@@ -2088,7 +2196,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
} else if (rdev->mddev->pers) {
mdk_rdev_t *rdev2;
/* Activating a spare .. or possibly reactivating
- * if we every get bitmaps working here.
+ * if we ever get bitmaps working here.
*/
if (rdev->raid_disk != -1)
@@ -2153,7 +2261,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EINVAL;
if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY;
- if (rdev->size && rdev->mddev->external)
+ if (rdev->sectors && rdev->mddev->external)
/* Must set offset before size, so overlap checks
* can be sane */
return -EBUSY;
@@ -2167,7 +2275,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
static ssize_t
rdev_size_show(mdk_rdev_t *rdev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}
static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2288,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
return 1;
}
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+ unsigned long long blocks;
+ sector_t new;
+
+ if (strict_strtoull(buf, 10, &blocks) < 0)
+ return -EINVAL;
+
+ if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+ return -EINVAL; /* sector conversion overflow */
+
+ new = blocks * 2;
+ if (new != blocks * 2)
+ return -EINVAL; /* unsigned long long to sector_t overflow */
+
+ *sectors = new;
+ return 0;
+}
+
static ssize_t
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
- unsigned long long size;
- unsigned long long oldsize = rdev->size;
mddev_t *my_mddev = rdev->mddev;
+ sector_t oldsectors = rdev->sectors;
+ sector_t sectors;
- if (strict_strtoull(buf, 10, &size) < 0)
+ if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (my_mddev->pers && rdev->raid_disk >= 0) {
if (my_mddev->persistent) {
- size = super_types[my_mddev->major_version].
- rdev_size_change(rdev, size * 2);
- if (!size)
+ sectors = super_types[my_mddev->major_version].
+ rdev_size_change(rdev, sectors);
+ if (!sectors)
return -EBUSY;
- } else if (!size) {
- size = (rdev->bdev->bd_inode->i_size >> 10);
- size -= rdev->data_offset/2;
- }
+ } else if (!sectors)
+ sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+ rdev->data_offset;
}
- if (size < my_mddev->size)
+ if (sectors < my_mddev->dev_sectors)
return -EINVAL; /* component must fit device */
- rdev->size = size;
- if (size > oldsize && my_mddev->external) {
+ rdev->sectors = sectors;
+ if (sectors > oldsectors && my_mddev->external) {
/* need to check that all other rdevs with the same ->bdev
* do not overlap. We need to unlock the mddev to avoid
- * a deadlock. We have already changed rdev->size, and if
+ * a deadlock. We have already changed rdev->sectors, and if
* we have to change it back, we will have the lock again.
*/
mddev_t *mddev;
@@ -2223,9 +2349,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (test_bit(AllReserved, &rdev2->flags) ||
(rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->size * 2,
+ overlaps(rdev->data_offset, rdev->sectors,
rdev2->data_offset,
- rdev2->size * 2))) {
+ rdev2->sectors))) {
overlap = 1;
break;
}
@@ -2239,11 +2365,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (overlap) {
/* Someone else could have slipped in a size
* change here, but doing so is just silly.
- * We put oldsize back because we *know* it is
+ * We put oldsectors back because we *know* it is
* safe, and trust userspace not to race with
* itself
*/
- rdev->size = oldsize;
+ rdev->sectors = oldsectors;
return -EBUSY;
}
}
@@ -2547,18 +2673,101 @@ level_show(mddev_t *mddev, char *page)
static ssize_t
level_store(mddev_t *mddev, const char *buf, size_t len)
{
+ char level[16];
ssize_t rv = len;
- if (mddev->pers)
+ struct mdk_personality *pers;
+ void *priv;
+
+ if (mddev->pers == NULL) {
+ if (len == 0)
+ return 0;
+ if (len >= sizeof(mddev->clevel))
+ return -ENOSPC;
+ strncpy(mddev->clevel, buf, len);
+ if (mddev->clevel[len-1] == '\n')
+ len--;
+ mddev->clevel[len] = 0;
+ mddev->level = LEVEL_NONE;
+ return rv;
+ }
+
+ /* request to change the personality. Need to ensure:
+ * - array is not engaged in resync/recovery/reshape
+ * - old personality can be suspended
+ * - new personality will access other array.
+ */
+
+ if (mddev->sync_thread || mddev->reshape_position != MaxSector)
return -EBUSY;
- if (len == 0)
- return 0;
- if (len >= sizeof(mddev->clevel))
- return -ENOSPC;
- strncpy(mddev->clevel, buf, len);
- if (mddev->clevel[len-1] == '\n')
+
+ if (!mddev->pers->quiesce) {
+ printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+ mdname(mddev), mddev->pers->name);
+ return -EINVAL;
+ }
+
+ /* Now find the new personality */
+ if (len == 0 || len >= sizeof(level))
+ return -EINVAL;
+ strncpy(level, buf, len);
+ if (level[len-1] == '\n')
len--;
- mddev->clevel[len] = 0;
- mddev->level = LEVEL_NONE;
+ level[len] = 0;
+
+ request_module("md-%s", level);
+ spin_lock(&pers_lock);
+ pers = find_pers(LEVEL_NONE, level);
+ if (!pers || !try_module_get(pers->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_WARNING "md: personality %s not loaded\n", level);
+ return -EINVAL;
+ }
+ spin_unlock(&pers_lock);
+
+ if (pers == mddev->pers) {
+ /* Nothing to do! */
+ module_put(pers->owner);
+ return rv;
+ }
+ if (!pers->takeover) {
+ module_put(pers->owner);
+ printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+ mdname(mddev), level);
+ return -EINVAL;
+ }
+
+ /* ->takeover must set new_* and/or delta_disks
+ * if it succeeds, and may set them when it fails.
+ */
+ priv = pers->takeover(mddev);
+ if (IS_ERR(priv)) {
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ mddev->raid_disks -= mddev->delta_disks;
+ mddev->delta_disks = 0;
+ module_put(pers->owner);
+ printk(KERN_WARNING "md: %s: %s would not accept array\n",
+ mdname(mddev), level);
+ return PTR_ERR(priv);
+ }
+
+ /* Looks like we have a winner */
+ mddev_suspend(mddev);
+ mddev->pers->stop(mddev);
+ module_put(mddev->pers->owner);
+ mddev->pers = pers;
+ mddev->private = priv;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ mddev->level = mddev->new_level;
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_size = mddev->new_chunk;
+ mddev->delta_disks = 0;
+ pers->run(mddev);
+ mddev_resume(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
return rv;
}
@@ -2586,12 +2795,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- if (mddev->pers)
- return -EBUSY;
- if (mddev->reshape_position != MaxSector)
+ if (mddev->pers) {
+ int err;
+ if (mddev->pers->reconfig == NULL)
+ return -EBUSY;
+ err = mddev->pers->reconfig(mddev, n, -1);
+ if (err)
+ return err;
+ } else {
mddev->new_layout = n;
- else
- mddev->layout = n;
+ if (mddev->reshape_position == MaxSector)
+ mddev->layout = n;
+ }
return len;
}
static struct md_sysfs_entry md_layout =
@@ -2648,19 +2863,24 @@ chunk_size_show(mddev_t *mddev, char *page)
static ssize_t
chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
{
- /* can only set chunk_size if array is not yet active */
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- if (mddev->pers)
- return -EBUSY;
- else if (mddev->reshape_position != MaxSector)
+ if (mddev->pers) {
+ int err;
+ if (mddev->pers->reconfig == NULL)
+ return -EBUSY;
+ err = mddev->pers->reconfig(mddev, -1, n);
+ if (err)
+ return err;
+ } else {
mddev->new_chunk = n;
- else
- mddev->chunk_size = n;
+ if (mddev->reshape_position == MaxSector)
+ mddev->chunk_size = n;
+ }
return len;
}
static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2889,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
static ssize_t
resync_start_show(mddev_t *mddev, char *page)
{
+ if (mddev->recovery_cp == MaxSector)
+ return sprintf(page, "none\n");
return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}
@@ -2766,7 +2988,7 @@ array_state_show(mddev_t *mddev, char *page)
else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
- mddev->size == 0)
+ mddev->dev_sectors == 0)
st = clear;
else
st = inactive;
@@ -2844,11 +3066,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
} else
err = -EBUSY;
spin_unlock_irq(&mddev->write_lock);
- } else {
- mddev->ro = 0;
- mddev->recovery_cp = MaxSector;
- err = do_md_run(mddev);
- }
+ } else
+ err = -EINVAL;
break;
case active:
if (mddev->pers) {
@@ -2973,7 +3192,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
static ssize_t
size_show(mddev_t *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->dev_sectors / 2);
}
static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3205,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
* not increase it (except from 0).
* If array is active, we can try an on-line resize
*/
- char *e;
- int err = 0;
- unsigned long long size = simple_strtoull(buf, &e, 10);
- if (!*buf || *buf == '\n' ||
- (*e && *e != '\n'))
- return -EINVAL;
+ sector_t sectors;
+ int err = strict_blocks_to_sectors(buf, &sectors);
+ if (err < 0)
+ return err;
if (mddev->pers) {
- err = update_size(mddev, size * 2);
+ err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
} else {
- if (mddev->size == 0 ||
- mddev->size > size)
- mddev->size = size;
+ if (mddev->dev_sectors == 0 ||
+ mddev->dev_sectors > sectors)
+ mddev->dev_sectors = sectors;
else
err = -ENOSPC;
}
@@ -3251,6 +3469,8 @@ static ssize_t
sync_speed_show(mddev_t *mddev, char *page)
{
unsigned long resync, dt, db;
+ if (mddev->curr_resync == 0)
+ return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
if (!dt) dt++;
@@ -3263,15 +3483,18 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
{
- unsigned long max_blocks, resync;
+ unsigned long max_sectors, resync;
+
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return sprintf(page, "none\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- max_blocks = mddev->resync_max_sectors;
+ max_sectors = mddev->resync_max_sectors;
else
- max_blocks = mddev->size << 1;
+ max_sectors = mddev->dev_sectors;
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
- return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+ resync = mddev->curr_resync_completed;
+ return sprintf(page, "%lu / %lu\n", resync, max_sectors);
}
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3654,57 @@ static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
reshape_position_store);
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+ if (mddev->external_size)
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->array_sectors/2);
+ else
+ return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ sector_t sectors;
+
+ if (strncmp(buf, "default", 7) == 0) {
+ if (mddev->pers)
+ sectors = mddev->pers->size(mddev, 0, 0);
+ else
+ sectors = mddev->array_sectors;
+
+ mddev->external_size = 0;
+ } else {
+ if (strict_blocks_to_sectors(buf, &sectors) < 0)
+ return -EINVAL;
+ if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+ return -EINVAL;
+
+ mddev->external_size = 1;
+ }
+
+ mddev->array_sectors = sectors;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ if (mddev->pers) {
+ struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+ if (bdev) {
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode,
+ (loff_t)mddev->array_sectors << 9);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ bdput(bdev);
+ }
+ }
+
+ return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+ array_size_store);
static struct attribute *md_default_attrs[] = {
&md_level.attr,
@@ -3444,6 +3718,7 @@ static struct attribute *md_default_attrs[] = {
&md_safe_delay.attr,
&md_array_state.attr,
&md_reshape_position.attr,
+ &md_array_size.attr,
NULL,
};
@@ -3602,10 +3877,12 @@ static int md_alloc(dev_t dev, char *name)
mddev_put(mddev);
return -ENOMEM;
}
+ mddev->queue->queuedata = mddev;
+
/* Can be unlocked because the queue is new: no concurrency */
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
- blk_queue_make_request(mddev->queue, md_fail_request);
+ blk_queue_make_request(mddev->queue, md_make_request);
disk = alloc_disk(1 << shift);
if (!disk) {
@@ -3731,13 +4008,13 @@ static int do_md_run(mddev_t * mddev)
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (test_bit(Faulty, &rdev->flags))
continue;
- if (rdev->size < chunk_size / 1024) {
+ if (rdev->sectors < chunk_size / 512) {
printk(KERN_WARNING
"md: Dev %s smaller than chunk_size:"
- " %lluk < %dk\n",
+ " %llu < %d\n",
bdevname(rdev->bdev,b),
- (unsigned long long)rdev->size,
- chunk_size / 1024);
+ (unsigned long long)rdev->sectors,
+ chunk_size / 512);
return -EINVAL;
}
}
@@ -3761,11 +4038,11 @@ static int do_md_run(mddev_t * mddev)
/* perform some consistency tests on the device.
* We don't want the data to overlap the metadata,
- * Internal Bitmap issues has handled elsewhere.
+ * Internal Bitmap issues have been handled elsewhere.
*/
if (rdev->data_offset < rdev->sb_start) {
- if (mddev->size &&
- rdev->data_offset + mddev->size*2
+ if (mddev->dev_sectors &&
+ rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
printk("md: %s: data overlaps metadata\n",
mdname(mddev));
@@ -3801,9 +4078,16 @@ static int do_md_run(mddev_t * mddev)
}
mddev->pers = pers;
spin_unlock(&pers_lock);
- mddev->level = pers->level;
+ if (mddev->level != pers->level) {
+ mddev->level = pers->level;
+ mddev->new_level = pers->level;
+ }
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ if (pers->level >= 4 && pers->level <= 6)
+ /* Cannot support integrity (yet) */
+ blk_integrity_unregister(mddev->gendisk);
+
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
@@ -3843,7 +4127,9 @@ static int do_md_run(mddev_t * mddev)
}
mddev->recovery = 0;
- mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+ /* may be over-ridden by personality */
+ mddev->resync_max_sectors = mddev->dev_sectors;
+
mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
@@ -3853,7 +4139,17 @@ static int do_md_run(mddev_t * mddev)
err = mddev->pers->run(mddev);
if (err)
printk(KERN_ERR "md: pers->run() failed ...\n");
- else if (mddev->pers->sync_request) {
+ else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+ WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+ " but 'external_size' not in effect?\n", __func__);
+ printk(KERN_ERR
+ "md: invalid array_size %llu > default size %llu\n",
+ (unsigned long long)mddev->array_sectors / 2,
+ (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+ err = -EINVAL;
+ mddev->pers->stop(mddev);
+ }
+ if (err == 0 && mddev->pers->sync_request) {
err = bitmap_create(mddev);
if (err) {
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4195,6 @@ static int do_md_run(mddev_t * mddev)
set_capacity(disk, mddev->array_sectors);
- /* If we call blk_queue_make_request here, it will
- * re-initialise max_sectors etc which may have been
- * refined inside -> run. So just set the bits we need to set.
- * Most initialisation happended when we called
- * blk_queue_make_request(..., md_fail_request)
- * earlier.
- */
- mddev->queue->queuedata = mddev;
- mddev->queue->make_request_fn = mddev->pers->make_request;
-
/* If there is a partially-recovered drive we need to
* start recovery here. If we leave it to md_check_recovery,
* it will remove the drives and not do the right thing
@@ -4008,6 +4294,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
int err = 0;
struct gendisk *disk = mddev->gendisk;
+ mdk_rdev_t *rdev;
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
@@ -4038,7 +4325,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
md_super_wait(mddev);
if (mddev->ro)
set_disk_ro(disk, 0);
- blk_queue_make_request(mddev->queue, md_fail_request);
+
mddev->pers->stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL;
@@ -4050,6 +4337,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent(mddev->sysfs_state);
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ }
+
set_capacity(disk, 0);
mddev->changed = 1;
@@ -4070,7 +4364,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
* Free resources if final stop
*/
if (mode == 0) {
- mdk_rdev_t *rdev;
printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
@@ -4082,20 +4375,14 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
}
mddev->bitmap_offset = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
- }
-
/* make sure all md_delayed_delete calls have finished */
flush_scheduled_work();
export_array(mddev);
mddev->array_sectors = 0;
- mddev->size = 0;
+ mddev->external_size = 0;
+ mddev->dev_sectors = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
mddev->resync_min = 0;
@@ -4135,6 +4422,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
mdname(mddev));
err = 0;
+ blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
out:
@@ -4300,8 +4588,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
info.patch_version = MD_PATCHLEVEL_VERSION;
info.ctime = mddev->ctime;
info.level = mddev->level;
- info.size = mddev->size;
- if (info.size != mddev->size) /* overflow */
+ info.size = mddev->dev_sectors / 2;
+ if (info.size != mddev->dev_sectors / 2) /* overflow */
info.size = -1;
info.nr_disks = nr;
info.raid_disks = mddev->raid_disks;
@@ -4480,6 +4768,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
clear_bit(In_sync, &rdev->flags); /* just to be sure */
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
+ else
+ clear_bit(WriteMostly, &rdev->flags);
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4833,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
} else
rdev->sb_start = calc_dev_sboffset(rdev->bdev);
- rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+ rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
err = bind_rdev_to_array(rdev, mddev);
if (err) {
@@ -4613,7 +4903,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
else
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
- rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+ rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
if (test_bit(Faulty, &rdev->flags)) {
printk(KERN_WARNING
@@ -4749,7 +5039,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->level = info->level;
mddev->clevel[0] = 0;
- mddev->size = info->size;
+ mddev->dev_sectors = 2 * (sector_t)info->size;
mddev->raid_disks = info->raid_disks;
/* don't set md_minor, it is determined by which /dev/md* was
* openned
@@ -4788,6 +5078,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
return 0;
}
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+ WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+ if (mddev->external_size)
+ return;
+
+ mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
static int update_size(mddev_t *mddev, sector_t num_sectors)
{
mdk_rdev_t *rdev;
@@ -4814,8 +5115,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
*/
return -EBUSY;
list_for_each_entry(rdev, &mddev->disks, same_set) {
- sector_t avail;
- avail = rdev->size * 2;
+ sector_t avail = rdev->sectors;
if (fit && (num_sectors == 0 || num_sectors > avail))
num_sectors = avail;
@@ -4887,12 +5187,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
)
return -EINVAL;
/* Check there is only one change */
- if (info->size >= 0 && mddev->size != info->size) cnt++;
- if (mddev->raid_disks != info->raid_disks) cnt++;
- if (mddev->layout != info->layout) cnt++;
- if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
- if (cnt == 0) return 0;
- if (cnt > 1) return -EINVAL;
+ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+ cnt++;
+ if (mddev->raid_disks != info->raid_disks)
+ cnt++;
+ if (mddev->layout != info->layout)
+ cnt++;
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+ cnt++;
+ if (cnt == 0)
+ return 0;
+ if (cnt > 1)
+ return -EINVAL;
if (mddev->layout != info->layout) {
/* Change layout
@@ -4904,7 +5210,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
else
return mddev->pers->reconfig(mddev, info->layout, -1);
}
- if (info->size >= 0 && mddev->size != info->size)
+ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks)
@@ -5331,6 +5637,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
void md_unregister_thread(mdk_thread_t *thread)
{
+ if (!thread)
+ return;
dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
kthread_stop(thread->tsk);
@@ -5394,37 +5702,38 @@ static void status_unused(struct seq_file *seq)
static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
- sector_t max_blocks, resync, res;
- unsigned long dt, db, rt;
+ sector_t max_sectors, resync, res;
+ unsigned long dt, db;
+ sector_t rt;
int scale;
unsigned int per_milli;
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- max_blocks = mddev->resync_max_sectors >> 1;
+ max_sectors = mddev->resync_max_sectors;
else
- max_blocks = mddev->size;
+ max_sectors = mddev->dev_sectors;
/*
* Should not happen.
*/
- if (!max_blocks) {
+ if (!max_sectors) {
MD_BUG();
return;
}
/* Pick 'scale' such that (resync>>scale)*1000 will fit
- * in a sector_t, and (max_blocks>>scale) will fit in a
+ * in a sector_t, and (max_sectors>>scale) will fit in a
* u32, as those are the requirements for sector_div.
* Thus 'scale' must be at least 10
*/
scale = 10;
if (sizeof(sector_t) > sizeof(unsigned long)) {
- while ( max_blocks/2 > (1ULL<<(scale+32)))
+ while ( max_sectors/2 > (1ULL<<(scale+32)))
scale++;
}
res = (resync>>scale)*1000;
- sector_div(res, (u32)((max_blocks>>scale)+1));
+ sector_div(res, (u32)((max_sectors>>scale)+1));
per_milli = res;
{
@@ -5445,25 +5754,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
"resync" : "recovery"))),
per_milli/10, per_milli % 10,
- (unsigned long long) resync,
- (unsigned long long) max_blocks);
+ (unsigned long long) resync/2,
+ (unsigned long long) max_sectors/2);
/*
- * We do not want to overflow, so the order of operands and
- * the * 100 / 100 trick are important. We do a +1 to be
- * safe against division by zero. We only estimate anyway.
- *
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
+ *
+ * rt is a sector_t, so could be 32bit or 64bit.
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid loosing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- mddev->resync_mark_cnt;
- rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
- seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+ rt = max_sectors - resync; /* number of remaining sectors */
+ sector_div(rt, db/32+1);
+ rt *= dt;
+ rt >>= 5;
+
+ seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
+ ((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
@@ -5537,7 +5856,7 @@ struct mdstat_info {
static int md_seq_show(struct seq_file *seq, void *v)
{
mddev_t *mddev = v;
- sector_t size;
+ sector_t sectors;
mdk_rdev_t *rdev;
struct mdstat_info *mi = seq->private;
struct bitmap *bitmap;
@@ -5573,7 +5892,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " %s", mddev->pers->name);
}
- size = 0;
+ sectors = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
@@ -5585,7 +5904,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
continue;
} else if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */
- size += rdev->size;
+ sectors += rdev->sectors;
}
if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5914,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->array_sectors / 2);
else
seq_printf(seq, "\n %llu blocks",
- (unsigned long long)size);
+ (unsigned long long)sectors / 2);
}
if (mddev->persistent) {
if (mddev->major_version != 0 ||
@@ -5654,7 +5973,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations md_seq_ops = {
+static const struct seq_operations md_seq_ops = {
.start = md_seq_start,
.next = md_seq_next,
.stop = md_seq_stop,
@@ -5722,19 +6041,19 @@ int unregister_md_personality(struct mdk_personality *p)
return 0;
}
-static int is_mddev_idle(mddev_t *mddev)
+static int is_mddev_idle(mddev_t *mddev, int init)
{
mdk_rdev_t * rdev;
int idle;
- long curr_events;
+ int curr_events;
idle = 1;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- curr_events = part_stat_read(&disk->part0, sectors[0]) +
- part_stat_read(&disk->part0, sectors[1]) -
- atomic_read(&disk->sync_io);
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
* disk_stats is counted when it completes.
@@ -5757,7 +6076,7 @@ static int is_mddev_idle(mddev_t *mddev)
* always make curr_events less than last_events.
*
*/
- if (curr_events - rdev->last_events > 4096) {
+ if (init || curr_events - rdev->last_events > 64) {
rdev->last_events = curr_events;
idle = 0;
}
@@ -5980,10 +6299,10 @@ void md_do_sync(mddev_t *mddev)
j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sectors = mddev->size << 1;
+ max_sectors = mddev->dev_sectors;
else {
/* recovery follows the physical size of devices */
- max_sectors = mddev->size << 1;
+ max_sectors = mddev->dev_sectors;
j = MaxSector;
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6319,7 @@ void md_do_sync(mddev_t *mddev)
"(but not more than %d KB/sec) for %s.\n",
speed_max(mddev), desc);
- is_mddev_idle(mddev); /* this also initializes IO event counters */
+ is_mddev_idle(mddev, 1); /* this initializes IO event counters */
io_sectors = 0;
for (m = 0; m < SYNC_MARKS; m++) {
@@ -6032,14 +6351,31 @@ void md_do_sync(mddev_t *mddev)
sector_t sectors;
skipped = 0;
- if (j >= mddev->resync_max) {
+
+ if ((mddev->curr_resync > mddev->curr_resync_completed &&
+ (mddev->curr_resync - mddev->curr_resync_completed)
+ > (max_sectors >> 4)) ||
+ (j - mddev->curr_resync_completed)*2
+ >= mddev->resync_max - mddev->curr_resync_completed
+ ) {
+ /* time to update curr_resync_completed */
+ blk_unplug(mddev->queue);
+ wait_event(mddev->recovery_wait,
+ atomic_read(&mddev->recovery_active) == 0);
+ mddev->curr_resync_completed =
+ mddev->curr_resync;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+
+ if (j >= mddev->resync_max)
wait_event(mddev->recovery_wait,
mddev->resync_max > j
|| kthread_should_stop());
- }
+
if (kthread_should_stop())
goto interrupted;
+
sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev));
if (sectors == 0) {
@@ -6102,7 +6438,7 @@ void md_do_sync(mddev_t *mddev)
if (currspeed > speed_min(mddev)) {
if ((currspeed > speed_max(mddev)) ||
- !is_mddev_idle(mddev)) {
+ !is_mddev_idle(mddev, 0)) {
msleep(500);
goto repeat;
}
@@ -6147,6 +6483,7 @@ void md_do_sync(mddev_t *mddev)
skip:
mddev->curr_resync = 0;
+ mddev->curr_resync_completed = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -6173,6 +6510,8 @@ static int remove_and_add_spares(mddev_t *mddev)
mdk_rdev_t *rdev;
int spares = 0;
+ mddev->curr_resync_completed = 0;
+
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6666,9 @@ void md_check_recovery(mddev_t *mddev)
sysfs_notify(&mddev->kobj, NULL,
"degraded");
}
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
md_update_sb(mddev, 1);
/* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6812,13 @@ static void md_geninit(void)
static int __init md_init(void)
{
- if (register_blkdev(MAJOR_NR, "md"))
+ if (register_blkdev(MD_MAJOR, "md"))
return -1;
if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
- unregister_blkdev(MAJOR_NR, "md");
+ unregister_blkdev(MD_MAJOR, "md");
return -1;
}
- blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+ blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
@@ -6562,10 +6904,10 @@ static __exit void md_exit(void)
mddev_t *mddev;
struct list_head *tmp;
- blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+ blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
- unregister_blkdev(MAJOR_NR,"md");
+ unregister_blkdev(MD_MAJOR,"md");
unregister_blkdev(mdp_major, "mdp");
unregister_reboot_notifier(&md_notifier);
unregister_sysctl_table(raid_table_header);
diff --git a/drivers/md/md.h b/drivers/md/md.h
new file mode 100644
index 000000000000..8227ab909d44
--- /dev/null
+++ b/drivers/md/md.h
@@ -0,0 +1,441 @@
+/*
+ md_k.h : kernel internal structure of the Linux MD driver
+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_MD_H
+#define _MD_MD_H
+
+#include <linux/blkdev.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
+#define MaxSector (~(sector_t)0)
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+/*
+ * options passed in raidrun:
+ */
+
+/* Currently this must fit in an 'int' */
+#define MAX_CHUNK_SIZE (1<<30)
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+ struct list_head same_set; /* RAID devices within the same set */
+
+ sector_t sectors; /* Device size (in 512bytes sectors) */
+ mddev_t *mddev; /* RAID array if running */
+ int last_events; /* IO event timestamp */
+
+ struct block_device *bdev; /* block device handle */
+
+ struct page *sb_page;
+ int sb_loaded;
+ __u64 sb_events;
+ sector_t data_offset; /* start of data in array */
+ sector_t sb_start; /* offset of the super block (in 512byte sectors) */
+ int sb_size; /* bytes in the superblock */
+ int preferred_minor; /* autorun support */
+
+ struct kobject kobj;
+
+ /* A device can be in one of three states based on two flags:
+ * Not working: faulty==1 in_sync==0
+ * Fully working: faulty==0 in_sync==1
+ * Working, but not
+ * in sync with array
+ * faulty==0 in_sync==0
+ *
+ * It can never have faulty==1, in_sync==1
+ * This reduces the burden of testing multiple flags in many cases
+ */
+
+ unsigned long flags;
+#define Faulty 1 /* device is known to have a fault */
+#define In_sync 2 /* device is in_sync with rest of array */
+#define WriteMostly 4 /* Avoid reading if at all possible */
+#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
+#define AllReserved 6 /* If whole device is reserved for
+ * one array */
+#define AutoDetected 7 /* added by auto-detect */
+#define Blocked 8 /* An error occured on an externally
+ * managed array, don't allow writes
+ * until it is cleared */
+#define StateChanged 9 /* Faulty or Blocked has changed during
+ * interrupt, so it needs to be
+ * notified by the thread */
+ wait_queue_head_t blocked_wait;
+
+ int desc_nr; /* descriptor index in the superblock */
+ int raid_disk; /* role of device in array */
+ int saved_raid_disk; /* role that device used to have in the
+ * array and could again if we did a partial
+ * resync from the bitmap
+ */
+ sector_t recovery_offset;/* If this device has been partially
+ * recovered, this is where we were
+ * up to.
+ */
+
+ atomic_t nr_pending; /* number of pending requests.
+ * only maintained for arrays that
+ * support hot removal
+ */
+ atomic_t read_errors; /* number of consecutive read errors that
+ * we have tried to ignore.
+ */
+ atomic_t corrected_errors; /* number of corrected read errors,
+ * for reporting to userspace and storing
+ * in superblock.
+ */
+ struct work_struct del_work; /* used for delayed sysfs removal */
+
+ struct sysfs_dirent *sysfs_state; /* handle for 'state'
+ * sysfs entry */
+};
+
+struct mddev_s
+{
+ void *private;
+ struct mdk_personality *pers;
+ dev_t unit;
+ int md_minor;
+ struct list_head disks;
+ unsigned long flags;
+#define MD_CHANGE_DEVS 0 /* Some device status has changed */
+#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
+#define MD_CHANGE_PENDING 2 /* superblock update in progress */
+
+ int suspended;
+ atomic_t active_io;
+ int ro;
+
+ struct gendisk *gendisk;
+
+ struct kobject kobj;
+ int hold_active;
+#define UNTIL_IOCTL 1
+#define UNTIL_STOP 2
+
+ /* Superblock information */
+ int major_version,
+ minor_version,
+ patch_version;
+ int persistent;
+ int external; /* metadata is
+ * managed externally */
+ char metadata_type[17]; /* externally set*/
+ int chunk_size;
+ time_t ctime, utime;
+ int level, layout;
+ char clevel[16];
+ int raid_disks;
+ int max_disks;
+ sector_t dev_sectors; /* used size of
+ * component devices */
+ sector_t array_sectors; /* exported array size */
+ int external_size; /* size managed
+ * externally */
+ __u64 events;
+
+ char uuid[16];
+
+ /* If the array is being reshaped, we need to record the
+ * new shape and an indication of where we are up to.
+ * This is written to the superblock.
+ * If reshape_position is MaxSector, then no reshape is happening (yet).
+ */
+ sector_t reshape_position;
+ int delta_disks, new_level, new_layout, new_chunk;
+
+ struct mdk_thread_s *thread; /* management thread */
+ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
+ sector_t curr_resync; /* last block scheduled */
+ /* As resync requests can complete out of order, we cannot easily track
+ * how much resync has been completed. So we occasionally pause until
+ * everything completes, then set curr_resync_completed to curr_resync.
+ * As such it may be well behind the real resync mark, but it is a value
+ * we are certain of.
+ */
+ sector_t curr_resync_completed;
+ unsigned long resync_mark; /* a recent timestamp */
+ sector_t resync_mark_cnt;/* blocks written at resync_mark */
+ sector_t curr_mark_cnt; /* blocks scheduled now */
+
+ sector_t resync_max_sectors; /* may be set by personality */
+
+ sector_t resync_mismatches; /* count of sectors where
+ * parity/replica mismatch found
+ */
+
+ /* allow user-space to request suspension of IO to regions of the array */
+ sector_t suspend_lo;
+ sector_t suspend_hi;
+ /* if zero, use the system-wide default */
+ int sync_speed_min;
+ int sync_speed_max;
+
+ /* resync even though the same disks are shared among md-devices */
+ int parallel_resync;
+
+ int ok_start_degraded;
+ /* recovery/resync flags
+ * NEEDED: we might need to start a resync/recover
+ * RUNNING: a thread is running, or about to be started
+ * SYNC: actually doing a resync, not a recovery
+ * RECOVER: doing recovery, or need to try it.
+ * INTR: resync needs to be aborted for some reason
+ * DONE: thread is done and is waiting to be reaped
+ * REQUEST: user-space has requested a sync (used with SYNC)
+ * CHECK: user-space request for for check-only, no repair
+ * RESHAPE: A reshape is happening
+ *
+ * If neither SYNC or RESHAPE are set, then it is a recovery.
+ */
+#define MD_RECOVERY_RUNNING 0
+#define MD_RECOVERY_SYNC 1
+#define MD_RECOVERY_RECOVER 2
+#define MD_RECOVERY_INTR 3
+#define MD_RECOVERY_DONE 4
+#define MD_RECOVERY_NEEDED 5
+#define MD_RECOVERY_REQUESTED 6
+#define MD_RECOVERY_CHECK 7
+#define MD_RECOVERY_RESHAPE 8
+#define MD_RECOVERY_FROZEN 9
+
+ unsigned long recovery;
+ int recovery_disabled; /* if we detect that recovery
+ * will always fail, set this
+ * so we don't loop trying */
+
+ int in_sync; /* know to not need resync */
+ struct mutex reconfig_mutex;
+ atomic_t active; /* general refcount */
+ atomic_t openers; /* number of active opens */
+
+ int changed; /* true if we might need to reread partition info */
+ int degraded; /* whether md should consider
+ * adding a spare
+ */
+ int barriers_work; /* initialised to true, cleared as soon
+ * as a barrier request to slave
+ * fails. Only supported
+ */
+ struct bio *biolist; /* bios that need to be retried
+ * because BIO_RW_BARRIER is not supported
+ */
+
+ atomic_t recovery_active; /* blocks scheduled, but not written */
+ wait_queue_head_t recovery_wait;
+ sector_t recovery_cp;
+ sector_t resync_min; /* user requested sync
+ * starts here */
+ sector_t resync_max; /* resync should pause
+ * when it gets here */
+
+ struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
+ * file in sysfs.
+ */
+ struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
+
+ struct work_struct del_work; /* used for delayed sysfs removal */
+
+ spinlock_t write_lock;
+ wait_queue_head_t sb_wait; /* for waiting on superblock updates */
+ atomic_t pending_writes; /* number of active superblock writes */
+
+ unsigned int safemode; /* if set, update "clean" superblock
+ * when no writes pending.
+ */
+ unsigned int safemode_delay;
+ struct timer_list safemode_timer;
+ atomic_t writes_pending;
+ struct request_queue *queue; /* for plugging ... */
+
+ atomic_t write_behind; /* outstanding async IO */
+ unsigned int max_write_behind; /* 0 = sync */
+
+ struct bitmap *bitmap; /* the bitmap for the device */
+ struct file *bitmap_file; /* the bitmap file */
+ long bitmap_offset; /* offset from superblock of
+ * start of bitmap. May be
+ * negative, but not '0'
+ */
+ long default_bitmap_offset; /* this is the offset to use when
+ * hot-adding a bitmap. It should
+ * eventually be settable by sysfs.
+ */
+
+ struct list_head all_mddevs;
+};
+
+
+static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+ int faulty = test_bit(Faulty, &rdev->flags);
+ if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+}
+
+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
+{
+ atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
+}
+
+struct mdk_personality
+{
+ char *name;
+ int level;
+ struct list_head list;
+ struct module *owner;
+ int (*make_request)(struct request_queue *q, struct bio *bio);
+ int (*run)(mddev_t *mddev);
+ int (*stop)(mddev_t *mddev);
+ void (*status)(struct seq_file *seq, mddev_t *mddev);
+ /* error_handler must set ->faulty and clear ->in_sync
+ * if appropriate, and should abort recovery if needed
+ */
+ void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
+ int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
+ int (*hot_remove_disk) (mddev_t *mddev, int number);
+ int (*spare_active) (mddev_t *mddev);
+ sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
+ int (*resize) (mddev_t *mddev, sector_t sectors);
+ sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
+ int (*check_reshape) (mddev_t *mddev);
+ int (*start_reshape) (mddev_t *mddev);
+ void (*finish_reshape) (mddev_t *mddev);
+ int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
+ /* quiesce moves between quiescence states
+ * 0 - fully active
+ * 1 - no new requests allowed
+ * others - reserved
+ */
+ void (*quiesce) (mddev_t *mddev, int state);
+ /* takeover is used to transition an array from one
+ * personality to another. The new personality must be able
+ * to handle the data in the current layout.
+ * e.g. 2drive raid1 -> 2drive raid5
+ * ndrive raid5 -> degraded n+1drive raid6 with special layout
+ * If the takeover succeeds, a new 'private' structure is returned.
+ * This needs to be installed and then ->run used to activate the
+ * array.
+ */
+ void *(*takeover) (mddev_t *mddev);
+};
+
+
+struct md_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(mddev_t *, char *);
+ ssize_t (*store)(mddev_t *, const char *, size_t);
+};
+
+
+static inline char * mdname (mddev_t * mddev)
+{
+ return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
+}
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Dont touch 'tmp' though.
+ */
+#define rdev_for_each_list(rdev, tmp, head) \
+ list_for_each_entry_safe(rdev, tmp, head, same_set)
+
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define rdev_for_each(rdev, tmp, mddev) \
+ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
+
+#define rdev_for_each_rcu(rdev, mddev) \
+ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
+typedef struct mdk_thread_s {
+ void (*run) (mddev_t *mddev);
+ mddev_t *mddev;
+ wait_queue_head_t wqueue;
+ unsigned long flags;
+ struct task_struct *tsk;
+ unsigned long timeout;
+} mdk_thread_t;
+
+#define THREAD_WAKEUP 0
+
+#define __wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock); \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, cmd); \
+} while (0)
+
+static inline void safe_put_page(struct page *p)
+{
+ if (p) put_page(p);
+}
+
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
+ mddev_t *mddev, const char *name);
+extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_wakeup_thread(mdk_thread_t *thread);
+extern void md_check_recovery(mddev_t *mddev);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_end(mddev_t *mddev);
+extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+ sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw);
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+
+#endif /* _MD_MD_H */
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index b61d5767aae7..3b1500843bba 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
uint8_t v;
uint8_t exptbl[256], invtbl[256];
- printf("#include \"raid6.h\"\n");
+ printf("#include <linux/raid/pq.h>\n");
/* Compute multiplication table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
printf("\t},\n");
}
printf("};\n");
+ printf("#ifdef __KERNEL__\n");
+ printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+ printf("#endif\n");
/* Compute power-of-2 table (exponent) */
v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
}
}
printf("};\n");
+ printf("#ifdef __KERNEL__\n");
+ printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+ printf("#endif\n");
/* Compute inverse table x^-1 == x^254 */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
}
}
printf("};\n");
+ printf("#ifdef __KERNEL__\n");
+ printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+ printf("#endif\n");
/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
(j == 7) ? '\n' : ' ');
}
printf("};\n");
+ printf("#ifdef __KERNEL__\n");
+ printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+ printf("#endif\n");
return 0;
}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index f6d08f241671..41ced0cbe823 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,7 +19,11 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/raid/multipath.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "multipath.h"
#define MAX_WORK_PER_DISK 128
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
+static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ return mddev->dev_sectors;
+}
+
static int multipath_run (mddev_t *mddev)
{
multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_sectors = mddev->size * 2;
+ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
+ .size = multipath_size,
};
static int __init multipath_init (void)
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
new file mode 100644
index 000000000000..6fa70b400cda
--- /dev/null
+++ b/drivers/md/multipath.h
@@ -0,0 +1,40 @@
+#ifndef _MULTIPATH_H
+#define _MULTIPATH_H
+
+struct multipath_info {
+ mdk_rdev_t *rdev;
+};
+
+struct multipath_private_data {
+ mddev_t *mddev;
+ struct multipath_info *multipaths;
+ int raid_disks;
+ int working_disks;
+ spinlock_t device_lock;
+ struct list_head retry_list;
+
+ mempool_t *pool;
+};
+
+typedef struct multipath_private_data multipath_conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
+
+/*
+ * this is our 'private' 'collective' MULTIPATH buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this MULTIPATH operation, and about their status:
+ */
+
+struct multipath_bh {
+ mddev_t *mddev;
+ struct bio *master_bio;
+ struct bio bio;
+ int path;
+ struct list_head retry_list;
+};
+#endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c605ba805586..c08d7559be55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,7 +18,10 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/raid/raid0.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid0.h"
static void raid0_unplug(struct request_queue *q)
{
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
list_for_each_entry(rdev2, &mddev->disks, same_set) {
printk(KERN_INFO "raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b),
- (unsigned long long)rdev1->size);
+ (unsigned long long)rdev1->sectors);
printk(KERN_INFO " with %s(%llu)\n",
bdevname(rdev2->bdev,b),
- (unsigned long long)rdev2->size);
+ (unsigned long long)rdev2->sectors);
if (rdev2 == rdev1) {
printk(KERN_INFO "raid0: END\n");
break;
}
- if (rdev2->size == rdev1->size)
- {
+ if (rdev2->sectors == rdev1->sectors) {
/*
* Not unique, don't count it as a new
* group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
mddev->queue->max_sectors > (PAGE_SIZE>>9))
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- if (!smallest || (rdev1->size <smallest->size))
+ if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
}
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
goto abort;
}
zone->nb_dev = cnt;
- zone->sectors = smallest->size * cnt * 2;
+ zone->sectors = smallest->sectors * cnt;
zone->zone_start = 0;
- current_start = smallest->size * 2;
+ current_start = smallest->sectors;
curr_zone_start = zone->sectors;
/* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
rdev = conf->strip_zone[0].dev[j];
printk(KERN_INFO "raid0: checking %s ...",
bdevname(rdev->bdev, b));
- if (rdev->size > current_start / 2) {
- printk(KERN_INFO " contained as device %d\n",
- c);
- zone->dev[c] = rdev;
- c++;
- if (!smallest || (rdev->size <smallest->size)) {
- smallest = rdev;
- printk(KERN_INFO " (%llu) is smallest!.\n",
- (unsigned long long)rdev->size);
- }
- } else
+ if (rdev->sectors <= current_start) {
printk(KERN_INFO " nope.\n");
+ continue;
+ }
+ printk(KERN_INFO " contained as device %d\n", c);
+ zone->dev[c] = rdev;
+ c++;
+ if (!smallest || rdev->sectors < smallest->sectors) {
+ smallest = rdev;
+ printk(KERN_INFO " (%llu) is smallest!.\n",
+ (unsigned long long)rdev->sectors);
+ }
}
zone->nb_dev = c;
- zone->sectors = (smallest->size * 2 - current_start) * c;
+ zone->sectors = (smallest->sectors - current_start) * c;
printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
zone->nb_dev, (unsigned long long)zone->sectors);
zone->zone_start = curr_zone_start;
curr_zone_start += zone->sectors;
- current_start = smallest->size * 2;
+ current_start = smallest->sectors;
printk(KERN_INFO "raid0: current zone start: %llu\n",
(unsigned long long)current_start);
}
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
return max;
}
+static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ sector_t array_sectors = 0;
+ mdk_rdev_t *rdev;
+
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ array_sectors += rdev->sectors;
+
+ return array_sectors;
+}
+
static int raid0_run (mddev_t *mddev)
{
unsigned cur=0, i=0, nb_zone;
s64 sectors;
raid0_conf_t *conf;
- mdk_rdev_t *rdev;
if (mddev->chunk_size == 0) {
printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf;
/* calculate array device size */
- mddev->array_sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- mddev->array_sectors += rdev->size * 2;
+ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
(unsigned long long)mddev->array_sectors);
printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
(unsigned long long)conf->spacing);
{
- sector_t s = mddev->array_sectors;
+ sector_t s = raid0_size(mddev, 0, 0);
sector_t space = conf->spacing;
int round;
conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
.run = raid0_run,
.stop = raid0_stop,
.status = raid0_status,
+ .size = raid0_size,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
new file mode 100644
index 000000000000..824b12eb1d4f
--- /dev/null
+++ b/drivers/md/raid0.h
@@ -0,0 +1,28 @@
+#ifndef _RAID0_H
+#define _RAID0_H
+
+struct strip_zone
+{
+ sector_t zone_start; /* Zone offset in md_dev (in sectors) */
+ sector_t dev_start; /* Zone offset in real dev (in sectors) */
+ sector_t sectors; /* Zone size in sectors */
+ int nb_dev; /* # of devices attached to the zone */
+ mdk_rdev_t **dev; /* Devices attached to the zone */
+};
+
+struct raid0_private_data
+{
+ struct strip_zone **hash_table; /* Table of indexes into strip_zone */
+ struct strip_zone *strip_zone;
+ mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+ int nr_strip_zones;
+
+ sector_t spacing;
+ int sector_shift; /* shift this before divide by spacing */
+};
+
+typedef struct raid0_private_data raid0_conf_t;
+
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2466425d9ca..36df9109cde1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,10 +31,12 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include "dm-bio-list.h"
#include <linux/delay.h>
-#include <linux/raid/raid1.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid1.h"
+#include "bitmap.h"
#define DEBUG 0
#if DEBUG
@@ -120,6 +122,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
goto out_free_pages;
bio->bi_io_vec[i].bv_page = page;
+ bio->bi_vcnt = i+1;
}
}
/* If not user-requests, copy the page pointers to all bios */
@@ -135,9 +138,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
out_free_pages:
- for (i=0; i < RESYNC_PAGES ; i++)
- for (j=0 ; j < pi->raid_disks; j++)
- safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+ for (j=0 ; j < pi->raid_disks; j++)
+ for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+ put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
while ( ++j < pi->raid_disks )
@@ -582,7 +585,7 @@ static int raid1_congested(void *data, int bits)
/* Note the '|| 1' - when read_balance prefers
* non-congested targets, it can be removed
*/
- if ((bits & (1<<BDI_write_congested)) || 1)
+ if ((bits & (1<<BDI_async_congested)) || 1)
ret |= bdi_congested(&q->backing_dev_info, bits);
else
ret &= bdi_congested(&q->backing_dev_info, bits);
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
}
- max_sector = mddev->size << 1;
+ max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return nr_sectors;
}
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ if (sectors)
+ return sectors;
+
+ return mddev->dev_sectors;
+}
+
static int run(mddev_t *mddev)
{
conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_sectors = mddev->size * 2;
+ md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
/* need to kick something here to make sure I/O goes? */
}
+ raise_barrier(conf);
+ lower_barrier(conf);
+
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- mddev->array_sectors = sectors;
+ md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+ if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+ return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
- if (mddev->array_sectors / 2 > mddev->size &&
+ if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
- mddev->recovery_cp = mddev->size << 1;
+ mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
- mddev->size = mddev->array_sectors / 2;
+ mddev->dev_sectors = sectors;
mddev->resync_max_sectors = sectors;
return 0;
}
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
.spare_active = raid1_spare_active,
.sync_request = sync_request,
.resize = raid1_resize,
+ .size = raid1_size,
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
};
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
new file mode 100644
index 000000000000..1620eea3d57c
--- /dev/null
+++ b/drivers/md/raid1.h
@@ -0,0 +1,132 @@
+#ifndef _RAID1_H
+#define _RAID1_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+ mdk_rdev_t *rdev;
+ sector_t head_position;
+};
+
+/*
+ * memory pools need a pointer to the mddev, so they can force an unplug
+ * when memory is tight, and a count of the number of drives that the
+ * pool was allocated for, so they know how much to allocate and free.
+ * mddev->raid_disks cannot be used, as it can change while a pool is active
+ * These two datums are stored in a kmalloced struct.
+ */
+
+struct pool_info {
+ mddev_t *mddev;
+ int raid_disks;
+};
+
+
+typedef struct r1bio_s r1bio_t;
+
+struct r1_private_data_s {
+ mddev_t *mddev;
+ mirror_info_t *mirrors;
+ int raid_disks;
+ int last_used;
+ sector_t next_seq_sect;
+ spinlock_t device_lock;
+
+ struct list_head retry_list;
+ /* queue pending writes and submit them on unplug */
+ struct bio_list pending_bio_list;
+ /* queue of writes that have been unplugged */
+ struct bio_list flushing_bio_list;
+
+ /* for use when syncing mirrors: */
+
+ spinlock_t resync_lock;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
+ sector_t next_resync;
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+
+ wait_queue_head_t wait_barrier;
+
+ struct pool_info *poolinfo;
+
+ struct page *tmppage;
+
+ mempool_t *r1bio_pool;
+ mempool_t *r1buf_pool;
+};
+
+typedef struct r1_private_data_s conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
+
+/*
+ * this is our 'private' RAID1 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID1 operation, and about their status:
+ */
+
+struct r1bio_s {
+ atomic_t remaining; /* 'have we finished' count,
+ * used from IRQ handlers
+ */
+ atomic_t behind_remaining; /* number of write-behind ios remaining
+ * in this BehindIO request
+ */
+ sector_t sector;
+ int sectors;
+ unsigned long state;
+ mddev_t *mddev;
+ /*
+ * original bio going to /dev/mdx
+ */
+ struct bio *master_bio;
+ /*
+ * if the IO is in READ direction, then this is where we read
+ */
+ int read_disk;
+
+ struct list_head retry_list;
+ struct bitmap_update *bitmap_update;
+ /*
+ * if the IO is in WRITE direction, then multiple bios are used.
+ * We choose the number when they are allocated.
+ */
+ struct bio *bios[0];
+ /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r1bio.state */
+#define R1BIO_Uptodate 0
+#define R1BIO_IsSync 1
+#define R1BIO_Degraded 2
+#define R1BIO_BehindIO 3
+#define R1BIO_Barrier 4
+#define R1BIO_BarrierRetry 5
+/* For write-behind requests, we call bi_end_io when
+ * the last non-write-behind device completes, providing
+ * any write was successful. Otherwise we call when
+ * any write-behind write succeeds, otherwise we call
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+#define R1BIO_Returned 6
+
+#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7301631abe04..499620afb44b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,10 +18,12 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include "dm-bio-list.h"
#include <linux/delay.h>
-#include <linux/raid/raid10.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid10.h"
+#include "bitmap.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1697,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
skipped:
- max_sector = mddev->size << 1;
+ max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
@@ -1807,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
r10_bio->sector = sect;
raid10_find_phys(conf, r10_bio);
- /* Need to check if this section will still be
+
+ /* Need to check if the array will still be
* degraded
*/
- for (j=0; j<conf->copies;j++) {
- int d = r10_bio->devs[j].devnum;
- if (conf->mirrors[d].rdev == NULL ||
- test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
+ for (j=0; j<conf->raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
still_degraded = 1;
break;
}
- }
+
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
@@ -2020,6 +2022,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
goto skipped;
}
+static sector_t
+raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ sector_t size;
+ conf_t *conf = mddev_to_conf(mddev);
+
+ if (!raid_disks)
+ raid_disks = mddev->raid_disks;
+ if (!sectors)
+ sectors = mddev->dev_sectors;
+
+ size = sectors >> conf->chunk_shift;
+ sector_div(size, conf->far_copies);
+ size = size * raid_disks;
+ sector_div(size, conf->near_copies);
+
+ return size << conf->chunk_shift;
+}
+
static int run(mddev_t *mddev)
{
conf_t *conf;
@@ -2076,7 +2097,7 @@ static int run(mddev_t *mddev)
conf->far_offset = fo;
conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
- size = mddev->size >> (conf->chunk_shift-1);
+ size = mddev->dev_sectors >> conf->chunk_shift;
sector_div(size, fc);
size = size * conf->raid_disks;
sector_div(size, nc);
@@ -2089,7 +2110,7 @@ static int run(mddev_t *mddev)
*/
stride += conf->raid_disks - 1;
sector_div(stride, conf->raid_disks);
- mddev->size = stride << (conf->chunk_shift-1);
+ mddev->dev_sectors = stride << conf->chunk_shift;
if (fo)
stride = 1;
@@ -2171,8 +2192,8 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->array_sectors = size << conf->chunk_shift;
- mddev->resync_max_sectors = size << conf->chunk_shift;
+ md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
+ mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2229,9 @@ static int stop(mddev_t *mddev)
{
conf_t *conf = mddev_to_conf(mddev);
+ raise_barrier(conf, 0);
+ lower_barrier(conf);
+
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2279,7 @@ static struct mdk_personality raid10_personality =
.spare_active = raid10_spare_active,
.sync_request = sync_request,
.quiesce = raid10_quiesce,
+ .size = raid10_size,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
new file mode 100644
index 000000000000..244dbe507a54
--- /dev/null
+++ b/drivers/md/raid10.h
@@ -0,0 +1,121 @@
+#ifndef _RAID10_H
+#define _RAID10_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+ mdk_rdev_t *rdev;
+ sector_t head_position;
+};
+
+typedef struct r10bio_s r10bio_t;
+
+struct r10_private_data_s {
+ mddev_t *mddev;
+ mirror_info_t *mirrors;
+ int raid_disks;
+ spinlock_t device_lock;
+
+ /* geometry */
+ int near_copies; /* number of copies layed out raid0 style */
+ int far_copies; /* number of copies layed out
+ * at large strides across drives
+ */
+ int far_offset; /* far_copies are offset by 1 stripe
+ * instead of many
+ */
+ int copies; /* near_copies * far_copies.
+ * must be <= raid_disks
+ */
+ sector_t stride; /* distance between far copies.
+ * This is size / far_copies unless
+ * far_offset, in which case it is
+ * 1 stripe.
+ */
+
+ int chunk_shift; /* shift from chunks to sectors */
+ sector_t chunk_mask;
+
+ struct list_head retry_list;
+ /* queue pending writes and submit them on unplug */
+ struct bio_list pending_bio_list;
+
+
+ spinlock_t resync_lock;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
+ sector_t next_resync;
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+
+ wait_queue_head_t wait_barrier;
+
+ mempool_t *r10bio_pool;
+ mempool_t *r10buf_pool;
+ struct page *tmppage;
+};
+
+typedef struct r10_private_data_s conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
+
+/*
+ * this is our 'private' RAID10 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID10 operation, and about their status:
+ */
+
+struct r10bio_s {
+ atomic_t remaining; /* 'have we finished' count,
+ * used from IRQ handlers
+ */
+ sector_t sector; /* virtual sector number */
+ int sectors;
+ unsigned long state;
+ mddev_t *mddev;
+ /*
+ * original bio going to /dev/mdx
+ */
+ struct bio *master_bio;
+ /*
+ * if the IO is in READ direction, then this is where we read
+ */
+ int read_slot;
+
+ struct list_head retry_list;
+ /*
+ * if the IO is in WRITE direction, then multiple bios are used,
+ * one for each copy.
+ * When resyncing we also use one for each copy.
+ * When reconstructing, we use 2 bios, one for read, one for write.
+ * We choose the number when they are allocated.
+ */
+ struct {
+ struct bio *bio;
+ sector_t addr;
+ int devnum;
+ } devs[0];
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r10bio.state */
+#define R10BIO_Uptodate 0
+#define R10BIO_IsSync 1
+#define R10BIO_IsRecover 2
+#define R10BIO_Degraded 3
+#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303b..4616bc3a6e71 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,11 +43,14 @@
* miss any bits.
*/
+#include <linux/blkdev.h>
#include <linux/kthread.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
+#include <linux/raid/pq.h>
#include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid5.h"
+#include "bitmap.h"
/*
* Stripe cache
@@ -91,11 +94,6 @@
#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
/*
* We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
}
+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+ if (sh->ddf_layout)
+ /* ddf always start from first device */
+ return 0;
+ /* md starts just after Q block */
+ if (sh->qd_idx == sh->disks - 1)
+ return 0;
+ else
+ return sh->qd_idx + 1;
+}
static inline int raid6_next_disk(int disk, int raid_disks)
{
disk++;
return (disk < raid_disks) ? disk : 0;
}
+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * We need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1. This help does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+ int *count, int syndrome_disks)
+{
+ int slot;
+
+ if (idx == sh->pd_idx)
+ return syndrome_disks;
+ if (idx == sh->qd_idx)
+ return syndrome_disks + 1;
+ slot = (*count)++;
+ return slot;
+}
+
static void return_io(struct bio *return_bi)
{
struct bio *bi = return_bi;
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
}
}
}
+
static void release_stripe(struct stripe_head *sh)
{
raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
return 0;
}
-static void raid5_build_block(struct stripe_head *sh, int i);
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+ struct stripe_head *sh);
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
raid5_conf_t *conf = sh->raid_conf;
int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
remove_hash(sh);
+ sh->generation = conf->generation - previous;
+ sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
sh->sector = sector;
- sh->pd_idx = pd_idx;
+ stripe_set_idx(sector, conf, previous, sh);
sh->state = 0;
- sh->disks = disks;
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
BUG();
}
dev->flags = 0;
- raid5_build_block(sh, i);
+ raid5_build_block(sh, i, previous);
}
insert_hash(conf, sh);
}
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
+ short generation)
{
struct stripe_head *sh;
struct hlist_node *hn;
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
CHECK_DEVLOCK();
pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
- if (sh->sector == sector && sh->disks == disks)
+ if (sh->sector == sector && sh->generation == generation)
return sh;
pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL;
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(struct request_queue *q);
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
- int pd_idx, int noblock)
+static struct stripe_head *
+get_active_stripe(raid5_conf_t *conf, sector_t sector,
+ int previous, int noblock)
{
struct stripe_head *sh;
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
conf->device_lock, /* nothing */);
- sh = __find_stripe(conf, sector, disks);
+ sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
sh = get_free_stripe(conf);
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
);
conf->inactive_blocked = 0;
} else
- init_stripe(sh, sector, pd_idx, disks);
+ init_stripe(sh, sector, previous);
} else {
if (atomic_read(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru));
+ BUG_ON(!list_empty(&sh->lru)
+ && !test_bit(STRIPE_EXPANDING, &sh->state));
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
struct kmem_cache *sc;
int devs = conf->raid_disks;
- sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
- sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
+ sprintf(conf->cache_name[0],
+ "raid%d-%s", conf->level, mdname(conf->mddev));
+ sprintf(conf->cache_name[1],
+ "raid%d-%s-alt", conf->level, mdname(conf->mddev));
conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
return 0;
}
-#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
/* Make all the stripes able to hold 'newsize' devices.
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
conf->pool_size = newsize;
return err;
}
-#endif
static int drop_one_stripe(raid5_conf_t *conf)
{
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf)
static void raid5_end_read_request(struct bio * bi, int error)
{
- struct stripe_head *sh = bi->bi_private;
+ struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
static void raid5_end_write_request(struct bio *bi, int error)
{
- struct stripe_head *sh = bi->bi_private;
+ struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i);
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-static void raid5_build_block(struct stripe_head *sh, int i)
+static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
dev->req.bi_private = sh;
dev->flags = 0;
- dev->sector = compute_blocknr(sh, i);
+ dev->sector = compute_blocknr(sh, i, previous);
}
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
- unsigned int data_disks, unsigned int * dd_idx,
- unsigned int * pd_idx, raid5_conf_t *conf)
+static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh)
{
long stripe;
unsigned long chunk_number;
unsigned int chunk_offset;
+ int pd_idx, qd_idx;
+ int ddf_layout = 0;
sector_t new_sector;
- int sectors_per_chunk = conf->chunk_size >> 9;
+ int algorithm = previous ? conf->prev_algo
+ : conf->algorithm;
+ int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+ : (conf->chunk_size >> 9);
+ int raid_disks = previous ? conf->previous_raid_disks
+ : conf->raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
/* First compute the information on this sector */
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
/*
* Select the parity disk based on the user selected algorithm.
*/
+ pd_idx = qd_idx = ~0;
switch(conf->level) {
case 4:
- *pd_idx = data_disks;
+ pd_idx = data_disks;
break;
case 5:
- switch (conf->algorithm) {
+ switch (algorithm) {
case ALGORITHM_LEFT_ASYMMETRIC:
- *pd_idx = data_disks - stripe % raid_disks;
- if (*dd_idx >= *pd_idx)
+ pd_idx = data_disks - stripe % raid_disks;
+ if (*dd_idx >= pd_idx)
(*dd_idx)++;
break;
case ALGORITHM_RIGHT_ASYMMETRIC:
- *pd_idx = stripe % raid_disks;
- if (*dd_idx >= *pd_idx)
+ pd_idx = stripe % raid_disks;
+ if (*dd_idx >= pd_idx)
(*dd_idx)++;
break;
case ALGORITHM_LEFT_SYMMETRIC:
- *pd_idx = data_disks - stripe % raid_disks;
- *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ pd_idx = data_disks - stripe % raid_disks;
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
break;
case ALGORITHM_RIGHT_SYMMETRIC:
- *pd_idx = stripe % raid_disks;
- *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ pd_idx = stripe % raid_disks;
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_PARITY_0:
+ pd_idx = 0;
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_PARITY_N:
+ pd_idx = data_disks;
break;
default:
printk(KERN_ERR "raid5: unsupported algorithm %d\n",
- conf->algorithm);
+ algorithm);
+ BUG();
}
break;
case 6:
- /**** FIX THIS ****/
- switch (conf->algorithm) {
+ switch (algorithm) {
case ALGORITHM_LEFT_ASYMMETRIC:
- *pd_idx = raid_disks - 1 - (stripe % raid_disks);
- if (*pd_idx == raid_disks-1)
- (*dd_idx)++; /* Q D D D P */
- else if (*dd_idx >= *pd_idx)
+ pd_idx = raid_disks - 1 - (stripe % raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
(*dd_idx) += 2; /* D D P Q D */
break;
case ALGORITHM_RIGHT_ASYMMETRIC:
- *pd_idx = stripe % raid_disks;
- if (*pd_idx == raid_disks-1)
- (*dd_idx)++; /* Q D D D P */
- else if (*dd_idx >= *pd_idx)
+ pd_idx = stripe % raid_disks;
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
(*dd_idx) += 2; /* D D P Q D */
break;
case ALGORITHM_LEFT_SYMMETRIC:
- *pd_idx = raid_disks - 1 - (stripe % raid_disks);
- *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+ pd_idx = raid_disks - 1 - (stripe % raid_disks);
+ qd_idx = (pd_idx + 1) % raid_disks;
+ *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
break;
case ALGORITHM_RIGHT_SYMMETRIC:
- *pd_idx = stripe % raid_disks;
- *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+ pd_idx = stripe % raid_disks;
+ qd_idx = (pd_idx + 1) % raid_disks;
+ *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+ break;
+
+ case ALGORITHM_PARITY_0:
+ pd_idx = 0;
+ qd_idx = 1;
+ (*dd_idx) += 2;
+ break;
+ case ALGORITHM_PARITY_N:
+ pd_idx = data_disks;
+ qd_idx = data_disks + 1;
break;
+
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ /* Exactly the same as RIGHT_ASYMMETRIC, but or
+ * of blocks for computing Q is different.
+ */
+ pd_idx = stripe % raid_disks;
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_ROTATING_N_RESTART:
+ /* Same a left_asymmetric, by first stripe is
+ * D D D P Q rather than
+ * Q D D D P
+ */
+ pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ /* Same as left_symmetric but Q is before P */
+ pd_idx = raid_disks - 1 - (stripe % raid_disks);
+ qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ /* RAID5 left_asymmetric, with Q on last device */
+ pd_idx = data_disks - stripe % (raid_disks-1);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ pd_idx = stripe % (raid_disks-1);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ pd_idx = data_disks - stripe % (raid_disks-1);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ pd_idx = stripe % (raid_disks-1);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_PARITY_0_6:
+ pd_idx = 0;
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+
default:
printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
- conf->algorithm);
+ algorithm);
+ BUG();
}
break;
}
+ if (sh) {
+ sh->pd_idx = pd_idx;
+ sh->qd_idx = qd_idx;
+ sh->ddf_layout = ddf_layout;
+ }
/*
* Finally, compute the new sector number
*/
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
raid5_conf_t *conf = sh->raid_conf;
int raid_disks = sh->disks;
int data_disks = raid_disks - conf->max_degraded;
sector_t new_sector = sh->sector, check;
- int sectors_per_chunk = conf->chunk_size >> 9;
+ int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+ : (conf->chunk_size >> 9);
+ int algorithm = previous ? conf->prev_algo
+ : conf->algorithm;
sector_t stripe;
int chunk_offset;
- int chunk_number, dummy1, dummy2, dd_idx = i;
+ int chunk_number, dummy1, dd_idx = i;
sector_t r_sector;
+ struct stripe_head sh2;
chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
switch(conf->level) {
case 4: break;
case 5:
- switch (conf->algorithm) {
+ switch (algorithm) {
case ALGORITHM_LEFT_ASYMMETRIC:
case ALGORITHM_RIGHT_ASYMMETRIC:
if (i > sh->pd_idx)
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
i += raid_disks;
i -= (sh->pd_idx + 1);
break;
+ case ALGORITHM_PARITY_0:
+ i -= 1;
+ break;
+ case ALGORITHM_PARITY_N:
+ break;
default:
printk(KERN_ERR "raid5: unsupported algorithm %d\n",
- conf->algorithm);
+ algorithm);
+ BUG();
}
break;
case 6:
- if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+ if (i == sh->qd_idx)
return 0; /* It is the Q disk */
- switch (conf->algorithm) {
+ switch (algorithm) {
case ALGORITHM_LEFT_ASYMMETRIC:
case ALGORITHM_RIGHT_ASYMMETRIC:
- if (sh->pd_idx == raid_disks-1)
- i--; /* Q D D D P */
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ case ALGORITHM_ROTATING_N_RESTART:
+ if (sh->pd_idx == raid_disks-1)
+ i--; /* Q D D D P */
else if (i > sh->pd_idx)
i -= 2; /* D D P Q D */
break;
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
i -= (sh->pd_idx + 2);
}
break;
+ case ALGORITHM_PARITY_0:
+ i -= 2;
+ break;
+ case ALGORITHM_PARITY_N:
+ break;
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ if (sh->pd_idx == 0)
+ i--; /* P D D D Q */
+ else if (i > sh->pd_idx)
+ i -= 2; /* D D Q P D */
+ break;
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ if (i > sh->pd_idx)
+ i--;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ if (i < sh->pd_idx)
+ i += data_disks + 1;
+ i -= (sh->pd_idx + 1);
+ break;
+ case ALGORITHM_PARITY_0_6:
+ i -= 1;
+ break;
default:
printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
- conf->algorithm);
+ algorithm);
+ BUG();
}
break;
}
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
chunk_number = stripe * data_disks + i;
r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
- check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
- if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+ check = raid5_compute_sector(conf, r_sector,
+ previous, &dummy1, &sh2);
+ if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+ || sh2.qd_idx != sh->qd_idx) {
printk(KERN_ERR "compute_blocknr: map not correct\n");
return 0;
}
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
static void compute_parity6(struct stripe_head *sh, int method)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+ int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
struct bio *chosen;
/**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[disks];
+ void *ptrs[syndrome_disks+2];
- qd_idx = raid6_next_disk(pd_idx, disks);
- d0_idx = raid6_next_disk(qd_idx, disks);
+ pd_idx = sh->pd_idx;
+ qd_idx = sh->qd_idx;
+ d0_idx = raid6_d0(sh);
pr_debug("compute_parity, stripe %llu, method %d\n",
(unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
set_bit(R5_UPTODATE, &sh->dev[i].flags);
}
-// switch(method) {
-// case RECONSTRUCT_WRITE:
-// case CHECK_PARITY:
-// case UPDATE_PARITY:
- /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
- /* FIX: Is this ordering of drives even remotely optimal? */
- count = 0;
- i = d0_idx;
- do {
- ptrs[count++] = page_address(sh->dev[i].page);
- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
- printk("block %d/%d not uptodate on parity calc\n", i,count);
- i = raid6_next_disk(i, disks);
- } while ( i != d0_idx );
-// break;
-// }
-
- raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+ /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
+
+ for (i = 0; i < disks; i++)
+ ptrs[i] = (void *)raid6_empty_zero_page;
+
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+ ptrs[slot] = page_address(sh->dev[i].page);
+ if (slot < syndrome_disks &&
+ !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+ printk(KERN_ERR "block %d/%d not uptodate "
+ "on parity calc\n", i, count);
+ BUG();
+ }
+
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+ BUG_ON(count != syndrome_disks);
+
+ raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
switch(method) {
case RECONSTRUCT_WRITE:
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
{
int i, count, disks = sh->disks;
void *ptr[MAX_XOR_BLOCKS], *dest, *p;
- int pd_idx = sh->pd_idx;
- int qd_idx = raid6_next_disk(pd_idx, disks);
+ int qd_idx = sh->qd_idx;
pr_debug("compute_block_1, stripe %llu, idx %d\n",
(unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
{
int i, count, disks = sh->disks;
- int pd_idx = sh->pd_idx;
- int qd_idx = raid6_next_disk(pd_idx, disks);
- int d0_idx = raid6_next_disk(qd_idx, disks);
- int faila, failb;
+ int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+ int d0_idx = raid6_d0(sh);
+ int faila = -1, failb = -1;
+ /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+ void *ptrs[syndrome_disks+2];
- /* faila and failb are disk numbers relative to d0_idx */
- /* pd_idx become disks-2 and qd_idx become disks-1 */
- faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
- failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+ for (i = 0; i < disks ; i++)
+ ptrs[i] = (void *)raid6_empty_zero_page;
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+ ptrs[slot] = page_address(sh->dev[i].page);
+
+ if (i == dd_idx1)
+ faila = slot;
+ if (i == dd_idx2)
+ failb = slot;
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+ BUG_ON(count != syndrome_disks);
BUG_ON(faila == failb);
if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
- (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+ (unsigned long long)sh->sector, dd_idx1, dd_idx2,
+ faila, failb);
- if ( failb == disks-1 ) {
+ if (failb == syndrome_disks+1) {
/* Q disk is one of the missing disks */
- if ( faila == disks-2 ) {
+ if (faila == syndrome_disks) {
/* Missing P+Q, just recompute */
compute_parity6(sh, UPDATE_PARITY);
return;
} else {
/* We're missing D+Q; recompute D from P */
- compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+ compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
+ dd_idx2 : dd_idx1),
+ 0);
compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
return;
}
}
- /* We're missing D+P or D+D; build pointer table */
- {
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[disks];
-
- count = 0;
- i = d0_idx;
- do {
- ptrs[count++] = page_address(sh->dev[i].page);
- i = raid6_next_disk(i, disks);
- if (i != dd_idx1 && i != dd_idx2 &&
- !test_bit(R5_UPTODATE, &sh->dev[i].flags))
- printk("compute_2 with missing block %d/%d\n", count, i);
- } while ( i != d0_idx );
-
- if ( failb == disks-2 ) {
- /* We're missing D+P. */
- raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
- } else {
- /* We're missing D+D. */
- raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
- }
-
- /* Both the above update both missing blocks */
- set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
- set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+ /* We're missing D+P or D+D; */
+ if (failb == syndrome_disks) {
+ /* We're missing D+P. */
+ raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
+ } else {
+ /* We're missing D+D. */
+ raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
+ ptrs);
}
+
+ /* Both the above update both missing blocks */
+ set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+ set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
}
static void
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
memcmp(a, a+4, STRIPE_SIZE-4)==0);
}
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+ struct stripe_head *sh)
{
- int sectors_per_chunk = conf->chunk_size >> 9;
- int pd_idx, dd_idx;
+ int sectors_per_chunk =
+ previous ? (conf->prev_chunk >> 9)
+ : (conf->chunk_size >> 9);
+ int dd_idx;
int chunk_offset = sector_div(stripe, sectors_per_chunk);
+ int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
- raid5_compute_sector(stripe * (disks - conf->max_degraded)
+ raid5_compute_sector(conf,
+ stripe * (disks - conf->max_degraded)
*sectors_per_chunk + chunk_offset,
- disks, disks - conf->max_degraded,
- &dd_idx, &pd_idx, conf);
- return pd_idx;
+ previous,
+ &dd_idx, sh);
}
static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
struct r6_state *r6s, int disks)
{
int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
- int qd_idx = r6s->qd_idx;
+ int qd_idx = sh->qd_idx;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
int update_p = 0, update_q = 0;
struct r5dev *dev;
int pd_idx = sh->pd_idx;
- int qd_idx = r6s->qd_idx;
+ int qd_idx = sh->qd_idx;
set_bit(STRIPE_HANDLE, &sh->state);
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
struct dma_async_tx_descriptor *tx = NULL;
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i = 0; i < sh->disks; i++)
- if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
- int dd_idx, pd_idx, j;
+ if (i != sh->pd_idx && i != sh->qd_idx) {
+ int dd_idx, j;
struct stripe_head *sh2;
- sector_t bn = compute_blocknr(sh, i);
- sector_t s = raid5_compute_sector(bn, conf->raid_disks,
- conf->raid_disks -
- conf->max_degraded, &dd_idx,
- &pd_idx, conf);
- sh2 = get_active_stripe(conf, s, conf->raid_disks,
- pd_idx, 1);
+ sector_t bn = compute_blocknr(sh, i, 1);
+ sector_t s = raid5_compute_sector(conf, bn, 0,
+ &dd_idx, NULL);
+ sh2 = get_active_stripe(conf, s, 0, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j = 0; j < conf->raid_disks; j++)
if (j != sh2->pd_idx &&
- (!r6s || j != raid6_next_disk(sh2->pd_idx,
- sh2->disks)) &&
+ (!r6s || j != sh2->qd_idx) &&
!test_bit(R5_Expanded, &sh2->dev[j].flags))
break;
if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
+ struct stripe_head *sh2
+ = get_active_stripe(conf, sh->sector, 1, 1);
+ if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+ /* sh cannot be written until sh2 has been read.
+ * so arrange for sh to be delayed a little
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh2->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh2);
+ goto unlock;
+ }
+ if (sh2)
+ release_stripe(sh2);
+
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
!sh->reconstruct_state) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
- sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
- conf->raid_disks);
+ stripe_set_idx(sh->sector, conf, 0, sh);
schedule_reconstruction5(sh, &s, 1, 1);
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
{
- raid6_conf_t *conf = sh->raid_conf;
+ raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks;
struct bio *return_bi = NULL;
- int i, pd_idx = sh->pd_idx;
+ int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
struct stripe_head_state s;
struct r6_state r6s;
struct r5dev *dev, *pdev, *qdev;
mdk_rdev_t *blocked_rdev = NULL;
- r6s.qd_idx = raid6_next_disk(pd_idx, disks);
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
"pd_idx=%d, qd_idx=%d\n",
(unsigned long long)sh->sector, sh->state,
- atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+ atomic_read(&sh->count), pd_idx, qd_idx);
memset(&s, 0, sizeof(s));
spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
pdev = &sh->dev[pd_idx];
r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
- qdev = &sh->dev[r6s.qd_idx];
- r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
- || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+ qdev = &sh->dev[qd_idx];
+ r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
+ || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
if ( s.written &&
( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
}
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ struct stripe_head *sh2
+ = get_active_stripe(conf, sh->sector, 1, 1);
+ if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+ /* sh cannot be written until sh2 has been read.
+ * so arrange for sh to be delayed a little
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh2->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh2);
+ goto unlock;
+ }
+ if (sh2)
+ release_stripe(sh2);
+
/* Need to write out all blocks after computing P&Q */
sh->disks = conf->raid_disks;
- sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
- conf->raid_disks);
+ stripe_set_idx(sh->sector, conf, 0, sh);
compute_parity6(sh, RECONSTRUCT_WRITE);
for (i = conf->raid_disks ; i-- ; ) {
set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
if ((bvm->bi_rw & 1) == WRITE)
return biovec->bv_len; /* always allow writes to be mergeable */
+ if (mddev->new_chunk < mddev->chunk_size)
+ chunk_sectors = mddev->new_chunk >> 9;
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
if (max < 0) max = 0;
if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
unsigned int chunk_sectors = mddev->chunk_size >> 9;
unsigned int bio_sectors = bio->bi_size >> 9;
+ if (mddev->new_chunk < mddev->chunk_size)
+ chunk_sectors = mddev->new_chunk >> 9;
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
{
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
- const unsigned int raid_disks = conf->raid_disks;
- const unsigned int data_disks = raid_disks - conf->max_degraded;
- unsigned int dd_idx, pd_idx;
+ unsigned int dd_idx;
struct bio* align_bi;
mdk_rdev_t *rdev;
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
return 0;
}
/*
- * use bio_clone to make a copy of the bio
+ * use bio_clone to make a copy of the bio
*/
align_bi = bio_clone(raid_bio, GFP_NOIO);
if (!align_bi)
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
/*
* compute position
*/
- align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
- raid_disks,
- data_disks,
- &dd_idx,
- &pd_idx,
- conf);
+ align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
+ 0,
+ &dd_idx, NULL);
rcu_read_lock();
rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
- unsigned int dd_idx, pd_idx;
+ int dd_idx;
sector_t new_sector;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
if (rw == READ &&
mddev->reshape_position == MaxSector &&
chunk_aligned_read(q,bi))
- return 0;
+ return 0;
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
int disks, data_disks;
+ int previous;
retry:
+ previous = 0;
+ disks = conf->raid_disks;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- if (likely(conf->expand_progress == MaxSector))
- disks = conf->raid_disks;
- else {
- /* spinlock is needed as expand_progress may be
+ if (unlikely(conf->reshape_progress != MaxSector)) {
+ /* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
- * Ofcourse expand_progress could change after
+ * Ofcourse reshape_progress could change after
* the lock is dropped, so once we get a reference
* to the stripe that we think it is, we will have
* to check again.
*/
spin_lock_irq(&conf->device_lock);
- disks = conf->raid_disks;
- if (logical_sector >= conf->expand_progress)
+ if (mddev->delta_disks < 0
+ ? logical_sector < conf->reshape_progress
+ : logical_sector >= conf->reshape_progress) {
disks = conf->previous_raid_disks;
- else {
- if (logical_sector >= conf->expand_lo) {
+ previous = 1;
+ } else {
+ if (mddev->delta_disks < 0
+ ? logical_sector < conf->reshape_safe
+ : logical_sector >= conf->reshape_safe) {
spin_unlock_irq(&conf->device_lock);
schedule();
goto retry;
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
}
data_disks = disks - conf->max_degraded;
- new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
- &dd_idx, &pd_idx, conf);
+ new_sector = raid5_compute_sector(conf, logical_sector,
+ previous,
+ &dd_idx, NULL);
pr_debug("raid5: make_request, sector %llu logical %llu\n",
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+ sh = get_active_stripe(conf, new_sector, previous,
+ (bi->bi_rw&RWA_MASK));
if (sh) {
- if (unlikely(conf->expand_progress != MaxSector)) {
+ if (unlikely(previous)) {
/* expansion might have moved on while waiting for a
* stripe, so we must do the range check again.
* Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
*/
int must_retry = 0;
spin_lock_irq(&conf->device_lock);
- if (logical_sector < conf->expand_progress &&
- disks == conf->previous_raid_disks)
+ if (mddev->delta_disks < 0
+ ? logical_sector >= conf->reshape_progress
+ : logical_sector < conf->reshape_progress)
/* mismatch, need to try again */
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
return 0;
}
+static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+
static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
{
/* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,120 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
*/
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
struct stripe_head *sh;
- int pd_idx;
sector_t first_sector, last_sector;
int raid_disks = conf->previous_raid_disks;
int data_disks = raid_disks - conf->max_degraded;
int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
- sector_t writepos, safepos, gap;
-
- if (sector_nr == 0 &&
- conf->expand_progress != 0) {
- /* restarting in the middle, skip the initial sectors */
- sector_nr = conf->expand_progress;
+ sector_t writepos, readpos, safepos;
+ sector_t stripe_addr;
+ int reshape_sectors;
+ struct list_head stripes;
+
+ if (sector_nr == 0) {
+ /* If restarting in the middle, skip the initial sectors */
+ if (mddev->delta_disks < 0 &&
+ conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+ sector_nr = raid5_size(mddev, 0, 0)
+ - conf->reshape_progress;
+ } else if (mddev->delta_disks > 0 &&
+ conf->reshape_progress > 0)
+ sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
- *skipped = 1;
- return sector_nr;
+ if (sector_nr) {
+ *skipped = 1;
+ return sector_nr;
+ }
}
+ /* We need to process a full chunk at a time.
+ * If old and new chunk sizes differ, we need to process the
+ * largest of these
+ */
+ if (mddev->new_chunk > mddev->chunk_size)
+ reshape_sectors = mddev->new_chunk / 512;
+ else
+ reshape_sectors = mddev->chunk_size / 512;
+
/* we update the metadata when there is more than 3Meg
* in the block range (that is rather arbitrary, should
* probably be time based) or when the data about to be
* copied would over-write the source of the data at
* the front of the range.
- * i.e. one new_stripe forward from expand_progress new_maps
- * to after where expand_lo old_maps to
+ * i.e. one new_stripe along from reshape_progress new_maps
+ * to after where reshape_safe old_maps to
*/
- writepos = conf->expand_progress +
- conf->chunk_size/512*(new_data_disks);
+ writepos = conf->reshape_progress;
sector_div(writepos, new_data_disks);
- safepos = conf->expand_lo;
+ readpos = conf->reshape_progress;
+ sector_div(readpos, data_disks);
+ safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
- gap = conf->expand_progress - conf->expand_lo;
+ if (mddev->delta_disks < 0) {
+ writepos -= reshape_sectors;
+ readpos += reshape_sectors;
+ safepos += reshape_sectors;
+ } else {
+ writepos += reshape_sectors;
+ readpos -= reshape_sectors;
+ safepos -= reshape_sectors;
+ }
- if (writepos >= safepos ||
- gap > (new_data_disks)*3000*2 /*3Meg*/) {
+ /* 'writepos' is the most advanced device address we might write.
+ * 'readpos' is the least advanced device address we might read.
+ * 'safepos' is the least address recorded in the metadata as having
+ * been reshaped.
+ * If 'readpos' is behind 'writepos', then there is no way that we can
+ * ensure safety in the face of a crash - that must be done by userspace
+ * making a backup of the data. So in that case there is no particular
+ * rush to update metadata.
+ * Otherwise if 'safepos' is behind 'writepos', then we really need to
+ * update the metadata to advance 'safepos' to match 'readpos' so that
+ * we can be safe in the event of a crash.
+ * So we insist on updating metadata if safepos is behind writepos and
+ * readpos is beyond writepos.
+ * In any case, update the metadata every 10 seconds.
+ * Maybe that number should be configurable, but I'm not sure it is
+ * worth it.... maybe it could be a multiple of safemode_delay???
+ */
+ if ((mddev->delta_disks < 0
+ ? (safepos > writepos && readpos < writepos)
+ : (safepos < writepos && readpos > writepos)) ||
+ time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
- mddev->reshape_position = conf->expand_progress;
+ mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = mddev->curr_resync;
+ conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop());
spin_lock_irq(&conf->device_lock);
- conf->expand_lo = mddev->reshape_position;
+ conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+ if (mddev->delta_disks < 0) {
+ BUG_ON(conf->reshape_progress == 0);
+ stripe_addr = writepos;
+ BUG_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1))
+ - reshape_sectors - stripe_addr
+ != sector_nr);
+ } else {
+ BUG_ON(writepos != sector_nr + reshape_sectors);
+ stripe_addr = sector_nr;
+ }
+ INIT_LIST_HEAD(&stripes);
+ for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped = 0;
- pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
- sh = get_active_stripe(conf, sector_nr+i,
- conf->raid_disks, pd_idx, 0);
+ sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -3592,10 +3884,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
if (j == sh->pd_idx)
continue;
if (conf->level == 6 &&
- j == raid6_next_disk(sh->pd_idx, sh->disks))
+ j == sh->qd_idx)
continue;
- s = compute_blocknr(sh, j);
- if (s < mddev->array_sectors) {
+ s = compute_blocknr(sh, j, 0);
+ if (s < raid5_size(mddev, 0, 0)) {
skipped = 1;
continue;
}
@@ -3607,10 +3899,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
set_bit(STRIPE_EXPAND_READY, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
- release_stripe(sh);
+ list_add(&sh->lru, &stripes);
}
spin_lock_irq(&conf->device_lock);
- conf->expand_progress = (sector_nr + i) * new_data_disks;
+ if (mddev->delta_disks < 0)
+ conf->reshape_progress -= reshape_sectors * new_data_disks;
+ else
+ conf->reshape_progress += reshape_sectors * new_data_disks;
spin_unlock_irq(&conf->device_lock);
/* Ok, those stripe are ready. We can start scheduling
* reads on the source stripes.
@@ -3618,46 +3913,53 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
* block on the destination stripes.
*/
first_sector =
- raid5_compute_sector(sector_nr*(new_data_disks),
- raid_disks, data_disks,
- &dd_idx, &pd_idx, conf);
+ raid5_compute_sector(conf, stripe_addr*(new_data_disks),
+ 1, &dd_idx, NULL);
last_sector =
- raid5_compute_sector((sector_nr+conf->chunk_size/512)
- *(new_data_disks) -1,
- raid_disks, data_disks,
- &dd_idx, &pd_idx, conf);
- if (last_sector >= (mddev->size<<1))
- last_sector = (mddev->size<<1)-1;
+ raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+ *(new_data_disks) - 1),
+ 1, &dd_idx, NULL);
+ if (last_sector >= mddev->dev_sectors)
+ last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
- pd_idx = stripe_to_pdidx(first_sector, conf,
- conf->previous_raid_disks);
- sh = get_active_stripe(conf, first_sector,
- conf->previous_raid_disks, pd_idx, 0);
+ sh = get_active_stripe(conf, first_sector, 1, 0);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
+ /* Now that the sources are clearly marked, we can release
+ * the destination stripes
+ */
+ while (!list_empty(&stripes)) {
+ sh = list_entry(stripes.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ release_stripe(sh);
+ }
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
*/
- sector_nr += conf->chunk_size>>9;
- if (sector_nr >= mddev->resync_max) {
+ sector_nr += reshape_sectors;
+ if ((sector_nr - mddev->curr_resync_completed) * 2
+ >= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
- mddev->reshape_position = conf->expand_progress;
+ mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = mddev->curr_resync;
+ conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_DEVS, &mddev->flags)
|| kthread_should_stop());
spin_lock_irq(&conf->device_lock);
- conf->expand_lo = mddev->reshape_position;
+ conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- return conf->chunk_size>>9;
+ return reshape_sectors;
}
/* FIXME go_faster isn't used */
@@ -3665,9 +3967,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
{
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
struct stripe_head *sh;
- int pd_idx;
- int raid_disks = conf->raid_disks;
- sector_t max_sector = mddev->size << 1;
+ sector_t max_sector = mddev->dev_sectors;
int sync_blocks;
int still_degraded = 0;
int i;
@@ -3675,6 +3975,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
return 0;
@@ -3705,7 +4006,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
*/
if (mddev->degraded >= conf->max_degraded &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- sector_t rv = (mddev->size << 1) - sector_nr;
+ sector_t rv = mddev->dev_sectors - sector_nr;
*skipped = 1;
return rv;
}
@@ -3721,10 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
- pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
- sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+ sh = get_active_stripe(conf, sector_nr, 0, 1);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+ sh = get_active_stripe(conf, sector_nr, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -3766,19 +4066,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
* it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
*/
struct stripe_head *sh;
- int dd_idx, pd_idx;
+ int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
int remaining;
int handled = 0;
logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- sector = raid5_compute_sector( logical_sector,
- conf->raid_disks,
- conf->raid_disks - conf->max_degraded,
- &dd_idx,
- &pd_idx,
- conf);
+ sector = raid5_compute_sector(conf, logical_sector,
+ 0, &dd_idx, NULL);
last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
for (; logical_sector < last_sector;
@@ -3790,7 +4086,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+ sh = get_active_stripe(conf, sector, 0, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -3992,89 +4288,69 @@ static struct attribute_group raid5_attrs_group = {
.attrs = raid5_attrs,
};
-static int run(mddev_t *mddev)
+static sector_t
+raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ if (!sectors)
+ sectors = mddev->dev_sectors;
+ if (!raid_disks) {
+ /* size is defined by the smallest of previous and new size */
+ if (conf->raid_disks < conf->previous_raid_disks)
+ raid_disks = conf->raid_disks;
+ else
+ raid_disks = conf->previous_raid_disks;
+ }
+
+ sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+ sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+ return sectors * (raid_disks - conf->max_degraded);
+}
+
+static raid5_conf_t *setup_conf(mddev_t *mddev)
{
raid5_conf_t *conf;
int raid_disk, memory;
mdk_rdev_t *rdev;
struct disk_info *disk;
- int working_disks = 0;
- if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
+ if (mddev->new_level != 5
+ && mddev->new_level != 4
+ && mddev->new_level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
- mdname(mddev), mddev->level);
- return -EIO;
+ mdname(mddev), mddev->new_level);
+ return ERR_PTR(-EIO);
}
-
- if (mddev->chunk_size < PAGE_SIZE) {
- printk(KERN_ERR "md/raid5: chunk_size must be at least "
- "PAGE_SIZE but %d < %ld\n",
- mddev->chunk_size, PAGE_SIZE);
- return -EINVAL;
+ if ((mddev->new_level == 5
+ && !algorithm_valid_raid5(mddev->new_layout)) ||
+ (mddev->new_level == 6
+ && !algorithm_valid_raid6(mddev->new_layout))) {
+ printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+ mdname(mddev), mddev->new_layout);
+ return ERR_PTR(-EIO);
}
-
- if (mddev->reshape_position != MaxSector) {
- /* Check that we can continue the reshape.
- * Currently only disks can change, it must
- * increase, and we must be past the point where
- * a stripe over-writes itself
- */
- sector_t here_new, here_old;
- int old_disks;
- int max_degraded = (mddev->level == 5 ? 1 : 2);
-
- if (mddev->new_level != mddev->level ||
- mddev->new_layout != mddev->layout ||
- mddev->new_chunk != mddev->chunk_size) {
- printk(KERN_ERR "raid5: %s: unsupported reshape "
- "required - aborting.\n",
- mdname(mddev));
- return -EINVAL;
- }
- if (mddev->delta_disks <= 0) {
- printk(KERN_ERR "raid5: %s: unsupported reshape "
- "(reduce disks) required - aborting.\n",
- mdname(mddev));
- return -EINVAL;
- }
- old_disks = mddev->raid_disks - mddev->delta_disks;
- /* reshape_position must be on a new-stripe boundary, and one
- * further up in new geometry must map after here in old
- * geometry.
- */
- here_new = mddev->reshape_position;
- if (sector_div(here_new, (mddev->chunk_size>>9)*
- (mddev->raid_disks - max_degraded))) {
- printk(KERN_ERR "raid5: reshape_position not "
- "on a stripe boundary\n");
- return -EINVAL;
- }
- /* here_new is the stripe we will write to */
- here_old = mddev->reshape_position;
- sector_div(here_old, (mddev->chunk_size>>9)*
- (old_disks-max_degraded));
- /* here_old is the first stripe that we might need to read
- * from */
- if (here_new >= here_old) {
- /* Reading from the same stripe as writing to - bad */
- printk(KERN_ERR "raid5: reshape_position too early for "
- "auto-recovery - aborting.\n");
- return -EINVAL;
- }
- printk(KERN_INFO "raid5: reshape will continue\n");
- /* OK, we should be able to continue; */
+ if (mddev->new_level == 6 && mddev->raid_disks < 4) {
+ printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+ mdname(mddev), mddev->raid_disks);
+ return ERR_PTR(-EINVAL);
}
+ if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+ printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
+ mddev->new_chunk, mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
- mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
- if ((conf = mddev->private) == NULL)
+ conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
+ if (conf == NULL)
goto abort;
- if (mddev->reshape_position == MaxSector) {
- conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
- } else {
- conf->raid_disks = mddev->raid_disks;
+
+ conf->raid_disks = mddev->raid_disks;
+ if (mddev->reshape_position == MaxSector)
+ conf->previous_raid_disks = mddev->raid_disks;
+ else
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
- }
conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
GFP_KERNEL);
@@ -4086,13 +4362,12 @@ static int run(mddev_t *mddev)
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
goto abort;
- if (mddev->level == 6) {
+ if (mddev->new_level == 6) {
conf->spare_page = alloc_page(GFP_KERNEL);
if (!conf->spare_page)
goto abort;
}
spin_lock_init(&conf->device_lock);
- mddev->queue->queue_lock = &conf->device_lock;
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -4121,47 +4396,134 @@ static int run(mddev_t *mddev)
printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
- working_disks++;
} else
/* Cannot rely on bitmap to complete recovery */
conf->fullsync = 1;
}
- /*
- * 0 for a fully functional array, 1 or 2 for a degraded array.
- */
- mddev->degraded = conf->raid_disks - working_disks;
- conf->mddev = mddev;
- conf->chunk_size = mddev->chunk_size;
- conf->level = mddev->level;
+ conf->chunk_size = mddev->new_chunk;
+ conf->level = mddev->new_level;
if (conf->level == 6)
conf->max_degraded = 2;
else
conf->max_degraded = 1;
- conf->algorithm = mddev->layout;
+ conf->algorithm = mddev->new_layout;
conf->max_nr_stripes = NR_STRIPES;
- conf->expand_progress = mddev->reshape_position;
-
- /* device size must be a multiple of chunk size */
- mddev->size &= ~(mddev->chunk_size/1024 -1);
- mddev->resync_max_sectors = mddev->size << 1;
+ conf->reshape_progress = mddev->reshape_position;
+ if (conf->reshape_progress != MaxSector) {
+ conf->prev_chunk = mddev->chunk_size;
+ conf->prev_algo = mddev->layout;
+ }
- if (conf->level == 6 && conf->raid_disks < 4) {
- printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
- mdname(mddev), conf->raid_disks);
+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+ conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
+ if (grow_stripes(conf, conf->max_nr_stripes)) {
+ printk(KERN_ERR
+ "raid5: couldn't allocate %dkB for buffers\n", memory);
goto abort;
- }
- if (!conf->chunk_size || conf->chunk_size % 4) {
- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
- conf->chunk_size, mdname(mddev));
+ } else
+ printk(KERN_INFO "raid5: allocated %dkB for %s\n",
+ memory, mdname(mddev));
+
+ conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "raid5: couldn't allocate thread for %s\n",
+ mdname(mddev));
goto abort;
}
- if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
- printk(KERN_ERR
- "raid5: unsupported parity algorithm %d for %s\n",
- conf->algorithm, mdname(mddev));
- goto abort;
+
+ return conf;
+
+ abort:
+ if (conf) {
+ shrink_stripes(conf);
+ safe_put_page(conf->spare_page);
+ kfree(conf->disks);
+ kfree(conf->stripe_hashtbl);
+ kfree(conf);
+ return ERR_PTR(-EIO);
+ } else
+ return ERR_PTR(-ENOMEM);
+}
+
+static int run(mddev_t *mddev)
+{
+ raid5_conf_t *conf;
+ int working_disks = 0;
+ mdk_rdev_t *rdev;
+
+ if (mddev->reshape_position != MaxSector) {
+ /* Check that we can continue the reshape.
+ * Currently only disks can change, it must
+ * increase, and we must be past the point where
+ * a stripe over-writes itself
+ */
+ sector_t here_new, here_old;
+ int old_disks;
+ int max_degraded = (mddev->level == 6 ? 2 : 1);
+
+ if (mddev->new_level != mddev->level) {
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "required - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ old_disks = mddev->raid_disks - mddev->delta_disks;
+ /* reshape_position must be on a new-stripe boundary, and one
+ * further up in new geometry must map after here in old
+ * geometry.
+ */
+ here_new = mddev->reshape_position;
+ if (sector_div(here_new, (mddev->new_chunk>>9)*
+ (mddev->raid_disks - max_degraded))) {
+ printk(KERN_ERR "raid5: reshape_position not "
+ "on a stripe boundary\n");
+ return -EINVAL;
+ }
+ /* here_new is the stripe we will write to */
+ here_old = mddev->reshape_position;
+ sector_div(here_old, (mddev->chunk_size>>9)*
+ (old_disks-max_degraded));
+ /* here_old is the first stripe that we might need to read
+ * from */
+ if (here_new >= here_old) {
+ /* Reading from the same stripe as writing to - bad */
+ printk(KERN_ERR "raid5: reshape_position too early for "
+ "auto-recovery - aborting.\n");
+ return -EINVAL;
+ }
+ printk(KERN_INFO "raid5: reshape will continue\n");
+ /* OK, we should be able to continue; */
+ } else {
+ BUG_ON(mddev->level != mddev->new_level);
+ BUG_ON(mddev->layout != mddev->new_layout);
+ BUG_ON(mddev->chunk_size != mddev->new_chunk);
+ BUG_ON(mddev->delta_disks != 0);
}
+
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
+ /*
+ * 0 for a fully functional array, 1 or 2 for a degraded array.
+ */
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0 &&
+ test_bit(In_sync, &rdev->flags))
+ working_disks++;
+
+ mddev->degraded = conf->raid_disks - working_disks;
+
if (mddev->degraded > conf->max_degraded) {
printk(KERN_ERR "raid5: not enough operational devices for %s"
" (%d/%d failed)\n",
@@ -4169,6 +4531,10 @@ static int run(mddev_t *mddev)
goto abort;
}
+ /* device size must be a multiple of chunk size */
+ mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+ mddev->resync_max_sectors = mddev->dev_sectors;
+
if (mddev->degraded > 0 &&
mddev->recovery_cp != MaxSector) {
if (mddev->ok_start_degraded)
@@ -4184,43 +4550,22 @@ static int run(mddev_t *mddev)
}
}
- {
- mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
- if (!mddev->thread) {
- printk(KERN_ERR
- "raid5: couldn't allocate thread for %s\n",
- mdname(mddev));
- goto abort;
- }
- }
- memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
- conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
- if (grow_stripes(conf, conf->max_nr_stripes)) {
- printk(KERN_ERR
- "raid5: couldn't allocate %dkB for buffers\n", memory);
- shrink_stripes(conf);
- md_unregister_thread(mddev->thread);
- goto abort;
- } else
- printk(KERN_INFO "raid5: allocated %dkB for %s\n",
- memory, mdname(mddev));
-
if (mddev->degraded == 0)
printk("raid5: raid level %d set %s active with %d out of %d"
- " devices, algorithm %d\n", conf->level, mdname(mddev),
- mddev->raid_disks-mddev->degraded, mddev->raid_disks,
- conf->algorithm);
+ " devices, algorithm %d\n", conf->level, mdname(mddev),
+ mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+ mddev->new_layout);
else
printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
" out of %d devices, algorithm %d\n", conf->level,
mdname(mddev), mddev->raid_disks - mddev->degraded,
- mddev->raid_disks, conf->algorithm);
+ mddev->raid_disks, mddev->new_layout);
print_raid5_conf(conf);
- if (conf->expand_progress != MaxSector) {
+ if (conf->reshape_progress != MaxSector) {
printk("...ok start reshape thread\n");
- conf->expand_lo = conf->expand_progress;
+ conf->reshape_safe = conf->reshape_progress;
atomic_set(&conf->reshape_stripes, 0);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4592,22 @@ static int run(mddev_t *mddev)
"raid5: failed to create sysfs attributes for %s\n",
mdname(mddev));
+ mddev->queue->queue_lock = &conf->device_lock;
+
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested;
- mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
- conf->max_degraded);
+ md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
return 0;
abort:
+ md_unregister_thread(mddev->thread);
+ mddev->thread = NULL;
if (conf) {
+ shrink_stripes(conf);
print_raid5_conf(conf);
safe_put_page(conf->spare_page);
kfree(conf->disks);
@@ -4396,6 +4745,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
print_raid5_conf(conf);
rdev = p->rdev;
if (rdev) {
+ if (number >= conf->raid_disks &&
+ conf->reshape_progress == MaxSector)
+ clear_bit(In_sync, &rdev->flags);
+
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
@@ -4405,7 +4758,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
* isn't possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- mddev->degraded <= conf->max_degraded) {
+ mddev->degraded <= conf->max_degraded &&
+ number < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
@@ -4472,36 +4826,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- raid5_conf_t *conf = mddev_to_conf(mddev);
-
sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
- mddev->array_sectors = sectors * (mddev->raid_disks
- - conf->max_degraded);
+ md_set_array_sectors(mddev, raid5_size(mddev, sectors,
+ mddev->raid_disks));
+ if (mddev->array_sectors >
+ raid5_size(mddev, sectors, mddev->raid_disks))
+ return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
- if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
- mddev->recovery_cp = mddev->size << 1;
+ if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+ mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
- mddev->size = sectors /2;
+ mddev->dev_sectors = sectors;
mddev->resync_max_sectors = sectors;
return 0;
}
-#ifdef CONFIG_MD_RAID5_RESHAPE
static int raid5_check_reshape(mddev_t *mddev)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
- int err;
- if (mddev->delta_disks < 0 ||
- mddev->new_level != mddev->level)
- return -EINVAL; /* Cannot shrink array or change level yet */
- if (mddev->delta_disks == 0)
- return 0; /* nothing to do */
+ if (mddev->delta_disks == 0 &&
+ mddev->new_layout == mddev->layout &&
+ mddev->new_chunk == mddev->chunk_size)
+ return -EINVAL; /* nothing to do */
if (mddev->bitmap)
/* Cannot grow a bitmap yet */
return -EBUSY;
+ if (mddev->degraded > conf->max_degraded)
+ return -EINVAL;
+ if (mddev->delta_disks < 0) {
+ /* We might be able to shrink, but the devices must
+ * be made bigger first.
+ * For raid6, 4 is the minimum size.
+ * Otherwise 2 is the minimum
+ */
+ int min = 2;
+ if (mddev->level == 6)
+ min = 4;
+ if (mddev->raid_disks + mddev->delta_disks < min)
+ return -EINVAL;
+ }
/* Can only proceed if there are plenty of stripe_heads.
* We need a minimum of one full stripe,, and for sensible progress
@@ -4514,18 +4880,12 @@ static int raid5_check_reshape(mddev_t *mddev)
if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
(mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
- (mddev->chunk_size / STRIPE_SIZE)*4);
+ (max(mddev->chunk_size, mddev->new_chunk)
+ / STRIPE_SIZE)*4);
return -ENOSPC;
}
- err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
- if (err)
- return err;
-
- if (mddev->degraded > conf->max_degraded)
- return -EINVAL;
- /* looks like we might be able to manage this */
- return 0;
+ return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
}
static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4910,31 @@ static int raid5_start_reshape(mddev_t *mddev)
*/
return -EINVAL;
+ /* Refuse to reduce size of the array. Any reductions in
+ * array size must be through explicit setting of array_size
+ * attribute.
+ */
+ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+ < mddev->array_sectors) {
+ printk(KERN_ERR "md: %s: array size must be reduced "
+ "before number of disks\n", mdname(mddev));
+ return -EINVAL;
+ }
+
atomic_set(&conf->reshape_stripes, 0);
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
conf->raid_disks += mddev->delta_disks;
- conf->expand_progress = 0;
- conf->expand_lo = 0;
+ conf->prev_chunk = conf->chunk_size;
+ conf->chunk_size = mddev->new_chunk;
+ conf->prev_algo = conf->algorithm;
+ conf->algorithm = mddev->new_layout;
+ if (mddev->delta_disks < 0)
+ conf->reshape_progress = raid5_size(mddev, 0, 0);
+ else
+ conf->reshape_progress = 0;
+ conf->reshape_safe = conf->reshape_progress;
+ conf->generation++;
spin_unlock_irq(&conf->device_lock);
/* Add some new drives, as many as will fit.
@@ -4580,9 +4959,12 @@ static int raid5_start_reshape(mddev_t *mddev)
break;
}
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (mddev->delta_disks > 0) {
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
+ - added_devices;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = 0;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4979,86 @@ static int raid5_start_reshape(mddev_t *mddev)
mddev->recovery = 0;
spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
- conf->expand_progress = MaxSector;
+ conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}
+ conf->reshape_checkpoint = jiffies;
md_wakeup_thread(mddev->sync_thread);
md_new_event(mddev);
return 0;
}
-#endif
+/* This is called from the reshape thread and should make any
+ * changes needed in 'conf'
+ */
static void end_reshape(raid5_conf_t *conf)
{
- struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- conf->mddev->array_sectors = 2 * conf->mddev->size *
- (conf->raid_disks - conf->max_degraded);
- set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
- conf->mddev->changed = 1;
-
- bdev = bdget_disk(conf->mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)conf->mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
+
spin_lock_irq(&conf->device_lock);
- conf->expand_progress = MaxSector;
+ conf->previous_raid_disks = conf->raid_disks;
+ conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
- conf->mddev->reshape_position = MaxSector;
+ wake_up(&conf->wait_for_overlap);
/* read-ahead size must cover two whole stripes, which is
* 2 * (datadisks) * chunksize where 'n' is the number of raid devices
*/
{
- int data_disks = conf->previous_raid_disks - conf->max_degraded;
- int stripe = data_disks *
- (conf->mddev->chunk_size / PAGE_SIZE);
+ int data_disks = conf->raid_disks - conf->max_degraded;
+ int stripe = data_disks * (conf->chunk_size
+ / PAGE_SIZE);
if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
}
}
+/* This is called from the raid5d thread with mddev_lock held.
+ * It makes config changes to the device.
+ */
+static void raid5_finish_reshape(mddev_t *mddev)
+{
+ struct block_device *bdev;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+ if (mddev->delta_disks > 0) {
+ md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ mddev->changed = 1;
+
+ bdev = bdget_disk(mddev->gendisk, 0);
+ if (bdev) {
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode,
+ (loff_t)mddev->array_sectors << 9);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ bdput(bdev);
+ }
+ } else {
+ int d;
+ mddev->degraded = conf->raid_disks;
+ for (d = 0; d < conf->raid_disks ; d++)
+ if (conf->disks[d].rdev &&
+ test_bit(In_sync,
+ &conf->disks[d].rdev->flags))
+ mddev->degraded--;
+ for (d = conf->raid_disks ;
+ d < conf->raid_disks - mddev->delta_disks;
+ d++)
+ raid5_remove_disk(mddev, d);
+ }
+ mddev->layout = conf->algorithm;
+ mddev->chunk_size = conf->chunk_size;
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ }
+}
+
static void raid5_quiesce(mddev_t *mddev, int state)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5088,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
}
}
+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+ int chunksect;
+
+ if (mddev->raid_disks != 2 ||
+ mddev->degraded > 1)
+ return ERR_PTR(-EINVAL);
+
+ /* Should check if there are write-behind devices? */
+
+ chunksect = 64*2; /* 64K by default */
+
+ /* The array must be an exact multiple of chunksize */
+ while (chunksect && (mddev->array_sectors & (chunksect-1)))
+ chunksect >>= 1;
+
+ if ((chunksect<<9) < STRIPE_SIZE)
+ /* array size does not allow a suitable chunk size */
+ return ERR_PTR(-EINVAL);
+
+ mddev->new_level = 5;
+ mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+ mddev->new_chunk = chunksect << 9;
+
+ return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid6(mddev_t *mddev)
+{
+ int new_layout;
+
+ switch (mddev->layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ new_layout = ALGORITHM_LEFT_SYMMETRIC;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+ break;
+ case ALGORITHM_PARITY_0_6:
+ new_layout = ALGORITHM_PARITY_0;
+ break;
+ case ALGORITHM_PARITY_N:
+ new_layout = ALGORITHM_PARITY_N;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ mddev->new_level = 5;
+ mddev->new_layout = new_layout;
+ mddev->delta_disks = -1;
+ mddev->raid_disks -= 1;
+ return setup_conf(mddev);
+}
+
+
+static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+ /* For a 2-drive array, the layout and chunk size can be changed
+ * immediately as not restriping is needed.
+ * For larger arrays we record the new value - after validation
+ * to be used by a reshape pass.
+ */
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+ return -EINVAL;
+ if (new_chunk > 0) {
+ if (new_chunk & (new_chunk-1))
+ /* not a power of 2 */
+ return -EINVAL;
+ if (new_chunk < PAGE_SIZE)
+ return -EINVAL;
+ if (mddev->array_sectors & ((new_chunk>>9)-1))
+ /* not factor of array size */
+ return -EINVAL;
+ }
+
+ /* They look valid */
+
+ if (mddev->raid_disks == 2) {
+
+ if (new_layout >= 0) {
+ conf->algorithm = new_layout;
+ mddev->layout = mddev->new_layout = new_layout;
+ }
+ if (new_chunk > 0) {
+ conf->chunk_size = new_chunk;
+ mddev->chunk_size = mddev->new_chunk = new_chunk;
+ }
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ } else {
+ if (new_layout >= 0)
+ mddev->new_layout = new_layout;
+ if (new_chunk > 0)
+ mddev->new_chunk = new_chunk;
+ }
+ return 0;
+}
+
+static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+ if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+ return -EINVAL;
+ if (new_chunk > 0) {
+ if (new_chunk & (new_chunk-1))
+ /* not a power of 2 */
+ return -EINVAL;
+ if (new_chunk < PAGE_SIZE)
+ return -EINVAL;
+ if (mddev->array_sectors & ((new_chunk>>9)-1))
+ /* not factor of array size */
+ return -EINVAL;
+ }
+
+ /* They look valid */
+
+ if (new_layout >= 0)
+ mddev->new_layout = new_layout;
+ if (new_chunk > 0)
+ mddev->new_chunk = new_chunk;
+
+ return 0;
+}
+
+static void *raid5_takeover(mddev_t *mddev)
+{
+ /* raid5 can take over:
+ * raid0 - if all devices are the same - make it a raid4 layout
+ * raid1 - if there are two drives. We need to know the chunk size
+ * raid4 - trivial - just use a raid4 layout.
+ * raid6 - Providing it is a *_6 layout
+ *
+ * For now, just do raid1
+ */
+
+ if (mddev->level == 1)
+ return raid5_takeover_raid1(mddev);
+ if (mddev->level == 4) {
+ mddev->new_layout = ALGORITHM_PARITY_N;
+ mddev->new_level = 5;
+ return setup_conf(mddev);
+ }
+ if (mddev->level == 6)
+ return raid5_takeover_raid6(mddev);
+
+ return ERR_PTR(-EINVAL);
+}
+
+
+static struct mdk_personality raid5_personality;
+
+static void *raid6_takeover(mddev_t *mddev)
+{
+ /* Currently can only take over a raid5. We map the
+ * personality to an equivalent raid6 personality
+ * with the Q block at the end.
+ */
+ int new_layout;
+
+ if (mddev->pers != &raid5_personality)
+ return ERR_PTR(-EINVAL);
+ if (mddev->degraded > 1)
+ return ERR_PTR(-EINVAL);
+ if (mddev->raid_disks > 253)
+ return ERR_PTR(-EINVAL);
+ if (mddev->raid_disks < 3)
+ return ERR_PTR(-EINVAL);
+
+ switch (mddev->layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_PARITY_0:
+ new_layout = ALGORITHM_PARITY_0_6;
+ break;
+ case ALGORITHM_PARITY_N:
+ new_layout = ALGORITHM_PARITY_N;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ mddev->new_level = 6;
+ mddev->new_layout = new_layout;
+ mddev->delta_disks = 1;
+ mddev->raid_disks += 1;
+ return setup_conf(mddev);
+}
+
+
static struct mdk_personality raid6_personality =
{
.name = "raid6",
@@ -4687,11 +5309,13 @@ static struct mdk_personality raid6_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+ .size = raid5_size,
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
-#endif
+ .finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
+ .takeover = raid6_takeover,
+ .reconfig = raid6_reconfig,
};
static struct mdk_personality raid5_personality =
{
@@ -4708,11 +5332,13 @@ static struct mdk_personality raid5_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+ .size = raid5_size,
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
-#endif
+ .finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
+ .takeover = raid5_takeover,
+ .reconfig = raid5_reconfig,
};
static struct mdk_personality raid4_personality =
@@ -4730,20 +5356,15 @@ static struct mdk_personality raid4_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+ .size = raid5_size,
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
-#endif
+ .finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
};
static int __init raid5_init(void)
{
- int e;
-
- e = raid6_select_algo();
- if ( e )
- return e;
register_md_personality(&raid6_personality);
register_md_personality(&raid5_personality);
register_md_personality(&raid4_personality);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
new file mode 100644
index 000000000000..52ba99954dec
--- /dev/null
+++ b/drivers/md/raid5.h
@@ -0,0 +1,474 @@
+#ifndef _RAID5_H
+#define _RAID5_H
+
+#include <linux/raid/xor.h>
+
+/*
+ *
+ * Each stripe contains one buffer per disc. Each buffer can be in
+ * one of a number of states stored in "flags". Changes between
+ * these states happen *almost* exclusively under a per-stripe
+ * spinlock. Some very specific changes can happen in bi_end_io, and
+ * these are not protected by the spin lock.
+ *
+ * The flag bits that are used to represent these states are:
+ * R5_UPTODATE and R5_LOCKED
+ *
+ * State Empty == !UPTODATE, !LOCK
+ * We have no data, and there is no active request
+ * State Want == !UPTODATE, LOCK
+ * A read request is being submitted for this block
+ * State Dirty == UPTODATE, LOCK
+ * Some new data is in this buffer, and it is being written out
+ * State Clean == UPTODATE, !LOCK
+ * We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ * Empty -> Want - on read or write to get old data for parity calc
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ * Empty -> Clean - on compute_block when computing a block for failed drive
+ * Want -> Empty - on failed read
+ * Want -> Clean - on successful completion of read request
+ * Dirty -> Clean - on successful completion of write request
+ * Dirty -> Clean - on failed write
+ * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ * Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states. That
+ * is if one drive has failed and there is a spare being rebuilt. We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare. A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but we are not guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
+ * routine is called. This may happen in the end_request routine only
+ * if the buffer has just successfully been read. end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer. Other threads may do this only if they first check
+ * that the Uptodate bit is set. Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written). Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos. The read list is
+ * protected by the device_lock. The write and written lists are
+ * protected by the stripe lock. The device_lock, which can be
+ * claimed while the stipe lock is held, is only for list
+ * manipulations and will only be held for a very short time. It can
+ * be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither). The "inactive_list" contains stripes which are not
+ * currently being used for any request. They can freely be reused
+ * for another stripe. The "handle_list" contains stripes that need
+ * to be handled in some way. Both of these are fifo queues. Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number. Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front. All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ * - stripes on the inactive_list never have their stripe_lock held.
+ * - stripes have a reference counter. If count==0, they are on a list.
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ * handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ * activate an unhashed/inactive stripe (get_active_stripe())
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ * activate a hashed, possibly active stripe (get_active_stripe())
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ * attach a request to an active stripe (add_stripe_bh())
+ * lockdev attach-buffer unlockdev
+ * handle a stripe (handle_stripe())
+ * lockstripe clrSTRIPE_HANDLE ...
+ * (lockdev check-buffers unlockdev) ..
+ * change-state ..
+ * record io/ops needed unlockstripe schedule io/ops
+ * release an active stripe (release_stripe())
+ * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that have activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ * read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count. raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ * so we prevent parity dependent operations like writes and compute_blocks
+ * from starting while a check is in progress. Some dma engines can perform
+ * the check without damaging the parity block, in these cases the parity
+ * block is re-marked up to date (assuming the check was successful) and is
+ * not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ * blocks, and mark them as not up to date. This causes new read requests
+ * to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ * that block as if it is up to date. raid5_run_ops guaruntees that any
+ * operation that is dependent on the compute block result is initiated after
+ * the compute block completes.
+ */
+
+/*
+ * Operations state - intermediate states that are visible outside of sh->lock
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon. For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+ check_state_idle = 0,
+ check_state_run, /* parity check */
+ check_state_check_result,
+ check_state_compute_run, /* parity repair */
+ check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+ reconstruct_state_idle = 0,
+ reconstruct_state_prexor_drain_run, /* prexor-write */
+ reconstruct_state_drain_run, /* write */
+ reconstruct_state_run, /* expand */
+ reconstruct_state_prexor_drain_result,
+ reconstruct_state_drain_result,
+ reconstruct_state_result,
+};
+
+struct stripe_head {
+ struct hlist_node hash;
+ struct list_head lru; /* inactive_list or handle_list */
+ struct raid5_private_data *raid_conf;
+ short generation; /* increments with every
+ * reshape */
+ sector_t sector; /* sector of this row */
+ short pd_idx; /* parity disk index */
+ short qd_idx; /* 'Q' disk index for raid6 */
+ short ddf_layout;/* use DDF ordering to calculate Q */
+ unsigned long state; /* state flags */
+ atomic_t count; /* nr of active thread/requests */
+ spinlock_t lock;
+ int bm_seq; /* sequence number for bitmap flushes */
+ int disks; /* disks in stripe */
+ enum check_states check_state;
+ enum reconstruct_states reconstruct_state;
+ /* stripe_operations
+ * @target - STRIPE_OP_COMPUTE_BLK target
+ */
+ struct stripe_operations {
+ int target;
+ u32 zero_sum_result;
+ } ops;
+ struct r5dev {
+ struct bio req;
+ struct bio_vec vec;
+ struct page *page;
+ struct bio *toread, *read, *towrite, *written;
+ sector_t sector; /* sector of this page */
+ unsigned long flags;
+ } dev[1]; /* allocated with extra space depending of RAID geometry */
+};
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ * for handle_stripe. It is only valid under spin_lock(sh->lock);
+ */
+struct stripe_head_state {
+ int syncing, expanding, expanded;
+ int locked, uptodate, to_read, to_write, failed, written;
+ int to_fill, compute, req_compute, non_overwrite;
+ int failed_num;
+ unsigned long ops_request;
+};
+
+/* r6_state - extra state data only relevant to r6 */
+struct r6_state {
+ int p_failed, q_failed, failed_num[2];
+};
+
+/* Flags */
+#define R5_UPTODATE 0 /* page contains current data */
+#define R5_LOCKED 1 /* IO has been submitted on "req" */
+#define R5_OVERWRITE 2 /* towrite covers whole page */
+/* and some that are internal to handle_stripe */
+#define R5_Insync 3 /* rdev && rdev->in_sync at start */
+#define R5_Wantread 4 /* want to schedule a read */
+#define R5_Wantwrite 5
+#define R5_Overlap 7 /* There is a pending overlapping request on this block */
+#define R5_ReadError 8 /* seen a read error here recently */
+#define R5_ReWrite 9 /* have tried to over-write the readerror */
+
+#define R5_Expanded 10 /* This block now has post-expand data */
+#define R5_Wantcompute 11 /* compute_block in progress treat as
+ * uptodate
+ */
+#define R5_Wantfill 12 /* dev->toread contains a bio that needs
+ * filling
+ */
+#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
+/*
+ * Write method
+ */
+#define RECONSTRUCT_WRITE 1
+#define READ_MODIFY_WRITE 2
+/* not a write method, but a compute_parity mode */
+#define CHECK_PARITY 3
+/* Additional compute_parity mode -- updates the parity w/o LOCKING */
+#define UPDATE_PARITY 4
+
+/*
+ * Stripe state
+ */
+#define STRIPE_HANDLE 2
+#define STRIPE_SYNCING 3
+#define STRIPE_INSYNC 4
+#define STRIPE_PREREAD_ACTIVE 5
+#define STRIPE_DELAYED 6
+#define STRIPE_DEGRADED 7
+#define STRIPE_BIT_DELAY 8
+#define STRIPE_EXPANDING 9
+#define STRIPE_EXPAND_SOURCE 10
+#define STRIPE_EXPAND_READY 11
+#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
+#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
+#define STRIPE_BIOFILL_RUN 14
+#define STRIPE_COMPUTE_RUN 15
+/*
+ * Operation request flags
+ */
+#define STRIPE_OP_BIOFILL 0
+#define STRIPE_OP_COMPUTE_BLK 1
+#define STRIPE_OP_PREXOR 2
+#define STRIPE_OP_BIODRAIN 3
+#define STRIPE_OP_POSTXOR 4
+#define STRIPE_OP_CHECK 5
+
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase. Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplug call is made. (the unplug_io_fn() is called).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the 'handle' queue is empty and the device is not plugged, we
+ * move any strips from delayed to handle and clear the DELAYED flag and set
+ * PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
+ */
+
+
+struct disk_info {
+ mdk_rdev_t *rdev;
+};
+
+struct raid5_private_data {
+ struct hlist_head *stripe_hashtbl;
+ mddev_t *mddev;
+ struct disk_info *spare;
+ int chunk_size, level, algorithm;
+ int max_degraded;
+ int raid_disks;
+ int max_nr_stripes;
+
+ /* reshape_progress is the leading edge of a 'reshape'
+ * It has value MaxSector when no reshape is happening
+ * If delta_disks < 0, it is the last sector we started work on,
+ * else is it the next sector to work on.
+ */
+ sector_t reshape_progress;
+ /* reshape_safe is the trailing edge of a reshape. We know that
+ * before (or after) this address, all reshape has completed.
+ */
+ sector_t reshape_safe;
+ int previous_raid_disks;
+ int prev_chunk, prev_algo;
+ short generation; /* increments with every reshape */
+ unsigned long reshape_checkpoint; /* Time we last updated
+ * metadata */
+
+ struct list_head handle_list; /* stripes needing handling */
+ struct list_head hold_list; /* preread ready stripes */
+ struct list_head delayed_list; /* stripes that have plugged requests */
+ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
+ struct bio *retry_read_aligned; /* currently retrying aligned bios */
+ struct bio *retry_read_aligned_list; /* aligned bios retry list */
+ atomic_t preread_active_stripes; /* stripes with scheduled io */
+ atomic_t active_aligned_reads;
+ atomic_t pending_full_writes; /* full write backlog */
+ int bypass_count; /* bypassed prereads */
+ int bypass_threshold; /* preread nice */
+ struct list_head *last_hold; /* detect hold_list promotions */
+
+ atomic_t reshape_stripes; /* stripes with pending writes for reshape */
+ /* unfortunately we need two cache names as we temporarily have
+ * two caches.
+ */
+ int active_name;
+ char cache_name[2][20];
+ struct kmem_cache *slab_cache; /* for allocating stripes */
+
+ int seq_flush, seq_write;
+ int quiesce;
+
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+
+ struct page *spare_page; /* Used when checking P/Q in raid6 */
+
+ /*
+ * Free stripes pool
+ */
+ atomic_t active_stripes;
+ struct list_head inactive_list;
+ wait_queue_head_t wait_for_stripe;
+ wait_queue_head_t wait_for_overlap;
+ int inactive_blocked; /* release of inactive stripes blocked,
+ * waiting for 25% to be free
+ */
+ int pool_size; /* number of disks in stripeheads in pool */
+ spinlock_t device_lock;
+ struct disk_info *disks;
+
+ /* When taking over an array from a different personality, we store
+ * the new thread here until we fully activate the array.
+ */
+ struct mdk_thread_s *thread;
+};
+
+typedef struct raid5_private_data raid5_conf_t;
+
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms. These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, or order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6 16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
+#define ALGORITHM_LEFT_SYMMETRIC_6 18
+#define ALGORITHM_RIGHT_SYMMETRIC_6 19
+#define ALGORITHM_PARITY_0_6 20
+#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+ return (layout >= 0) &&
+ (layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+ return (layout >= 0 && layout <= 5)
+ ||
+ (layout == 8 || layout == 10)
+ ||
+ (layout >= 16 && layout <= 20);
+}
+
+static inline int algorithm_is_DDF(int layout)
+{
+ return layout >= 8 && layout <= 10;
+}
+#endif
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h
deleted file mode 100644
index 98dcde88470e..000000000000
--- a/drivers/md/raid6.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * Copyright 2003 H. Peter Anvin - All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-#ifndef LINUX_RAID_RAID6_H
-#define LINUX_RAID_RAID6_H
-
-#ifdef __KERNEL__
-
-/* Set to 1 to use kernel-wide empty_zero_page */
-#define RAID6_USE_EMPTY_ZERO_PAGE 0
-
-#include <linux/raid/md.h>
-#include <linux/raid/raid5.h>
-
-typedef raid5_conf_t raid6_conf_t; /* Same configuration */
-
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY 4
-
-/* We need a pre-zeroed page... if we don't want to use the kernel-provided
- one define it here */
-#if RAID6_USE_EMPTY_ZERO_PAGE
-# define raid6_empty_zero_page empty_zero_page
-#else
-extern const char raid6_empty_zero_page[PAGE_SIZE];
-#endif
-
-#else /* ! __KERNEL__ */
-/* Used for testing in user space */
-
-#include <errno.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <stddef.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-
-/* Not standard, but glibc defines it */
-#define BITS_PER_LONG __WORDSIZE
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE 4096
-#endif
-extern const char raid6_empty_zero_page[PAGE_SIZE];
-
-#define __init
-#define __exit
-#define __attribute_const__ __attribute__((const))
-#define noinline __attribute__((noinline))
-
-#define preempt_enable()
-#define preempt_disable()
-#define cpu_has_feature(x) 1
-#define enable_kernel_altivec()
-#define disable_kernel_altivec()
-
-#endif /* __KERNEL__ */
-
-/* Routine choices */
-struct raid6_calls {
- void (*gen_syndrome)(int, size_t, void **);
- int (*valid)(void); /* Returns 1 if this routine set is usable */
- const char *name; /* Name of this routine set */
- int prefer; /* Has special performance attribute */
-};
-
-/* Selected algorithm */
-extern struct raid6_calls raid6_call;
-
-/* Algorithm list */
-extern const struct raid6_calls * const raid6_algos[];
-int raid6_select_algo(void);
-
-/* Return values from chk_syndrome */
-#define RAID6_OK 0
-#define RAID6_P_BAD 1
-#define RAID6_Q_BAD 2
-#define RAID6_PQ_BAD 3
-
-/* Galois field tables */
-extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
-extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
-extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
-extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
-
-/* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
-void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
-
-/* Some definitions to allow code to be compiled for testing in userspace */
-#ifndef __KERNEL__
-
-# define jiffies raid6_jiffies()
-# define printk printf
-# define GFP_KERNEL 0
-# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
-# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
-
-static inline void cpu_relax(void)
-{
- /* Nothing */
-}
-
-#undef HZ
-#define HZ 1000
-static inline uint32_t raid6_jiffies(void)
-{
- struct timeval tv;
- gettimeofday(&tv, NULL);
- return tv.tv_sec*1000 + tv.tv_usec/1000;
-}
-
-#endif /* ! __KERNEL__ */
-
-#endif /* LINUX_RAID_RAID6_H */
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 21987e3dbe6c..866215ac7f25 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
* Algorithm list and algorithm selection for RAID-6
*/
-#include "raid6.h"
+#include <linux/raid/pq.h>
#ifndef __KERNEL__
#include <sys/mman.h>
#include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
#endif
struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
/* Various routine sets */
extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
#else
/* Need more time to be stable in userspace */
#define RAID6_TIME_JIFFIES_LG2 9
+#define time_before(x, y) ((x) < (y))
#endif
/* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
return best ? 0 : -EINVAL;
}
+
+static void raid6_exit(void)
+{
+ do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index b9afd35b8812..699dfeee4944 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
* bracked this with preempt_disable/enable or in a lock)
*/
-#include "raid6.h"
+#include <linux/raid/pq.h>
#ifdef CONFIG_ALTIVEC
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index ad004cee0e26..f9bf9cba357f 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
* This file is postprocessed using unroll.pl
*/
-#include "raid6.h"
+#include <linux/raid/pq.h>
/*
* This is the C data type to use
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
index d4e4a1bd70ad..e7f6c13132bf 100644
--- a/drivers/md/raid6mmx.c
+++ b/drivers/md/raid6mmx.c
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
#if defined(__i386__) && !defined(__arch_um__)
-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"
/* Shared with raid6sse1.c */
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
index a8c4d9451bd9..2609f00e0d61 100644
--- a/drivers/md/raid6recov.c
+++ b/drivers/md/raid6recov.c
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
* the syndrome.)
*/
-#include "raid6.h"
+#include <linux/raid/pq.h>
/* Recover two failed data blocks. */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
p++; q++;
}
}
-
-
-
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
/* Recover failure of one data block plus the P block */
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
q++; dq++;
}
}
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
-
-#ifndef __KERNEL__ /* Testing only */
+#ifndef __KERNEL__
+/* Testing only */
/* Recover two failed blocks. */
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
index 0666237276ff..b274dd5eab8f 100644
--- a/drivers/md/raid6sse1.c
+++ b/drivers/md/raid6sse1.c
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
#if defined(__i386__) && !defined(__arch_um__)
-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"
/* Defined in raid6mmx.c */
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
index b034ad868039..6ed6c6c0389f 100644
--- a/drivers/md/raid6sse2.c
+++ b/drivers/md/raid6sse2.c
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"
static const struct raid6_sse_constants {
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 78e0396adf2a..58ffdf4f5161 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -5,7 +5,7 @@
CC = gcc
OPTFLAGS = -O2 # Adjust as desired
-CFLAGS = -I.. -g $(OPTFLAGS)
+CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
LD = ld
PERL = perl
AR = ar
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
index 559cc41b2585..7a930318b17d 100644
--- a/drivers/md/raid6test/test.c
+++ b/drivers/md/raid6test/test.c
@@ -17,7 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
-#include "raid6.h"
+#include <linux/raid/pq.h>
#define NDISKS 16 /* Including P and Q */
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h
index 99fea7a70ca7..4c22c1568558 100644
--- a/drivers/md/raid6x86.h
+++ b/drivers/md/raid6x86.h
@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */