From 9114d79569a3fb858a7ecb1f21cb1dec93dc2f21 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:42 +0100 Subject: drbd: cleanup bogus assert message This fixes ASSERT( mdev->state.disk == D_FAILED ) in drivers/block/drbd/drbd_main.c When we detach from local disk, we let the local refcount hit zero twice. First, we transition to D_FAILED, so we won't give out new references to incoming requests; we still may give out *internal* references, though. Once the refcount hits zero [1] while in D_FAILED, we queue a transition to D_DISKLESS to our worker. We need to queue it, because we may be in atomic context when putting the reference. Once the transition to D_DISKLESS actually happened [2] from worker context, we don't give out new internal references either. Between hitting zero the first time [1] and actually transition to D_DISKLESS [2], there may be a few very short lived internal get/put, so we may hit zero more than once while being in D_FAILED, or even see a race where a an internal get_ldev() happened while D_FAILED, but the corresponding put_ldev() happens just after the transition to D_DISKLESS. That's why we have the additional test_and_set_bit(GO_DISKLESS,); and that's why the assert was placed wrong. Since there was exactly one code path left to drbd_go_diskless(), and that checks already for D_FAILED, drop that assert, and fold in the drbd_queue_work(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 7 ++++--- drivers/block/drbd/drbd_main.c | 7 ------- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6b51afa1aae1..db504d021a6e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1148,7 +1148,6 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, char *why, enum bm_flag flags); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); -extern void drbd_go_diskless(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout @@ -2053,9 +2052,11 @@ static inline void put_ldev(struct drbd_conf *mdev) if (mdev->state.disk == D_DISKLESS) /* even internal references gone, safe to destroy */ drbd_ldev_destroy(mdev); - if (mdev->state.disk == D_FAILED) + if (mdev->state.disk == D_FAILED) { /* all application IO references gone. */ - drbd_go_diskless(mdev); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); + } wake_up(&mdev->misc_wait); } } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e98da675f0c1..731a28eedc56 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3252,13 +3252,6 @@ static int w_go_diskless(struct drbd_work *w, int unused) return 0; } -void drbd_go_diskless(struct drbd_conf *mdev) -{ - D_ASSERT(mdev->state.disk == D_FAILED); - if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); -} - /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @mdev: DRBD device. -- cgit v1.2.3-59-g8ed1b From ae8bf312e97d554b6aa32e7b2ceb993812ad0835 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:43 +0100 Subject: drbd: cleanup ondisk meta data layout calculations and defines Add a comment about our meta data layout variants, and rename a few defines (e.g. MD_RESERVED_SECT -> MD_128MB_SECT) to make it clear that they are short hand for fixed constants, and not arbitrarily to be redefined as one may see fit. Properly pad struct meta_data_on_disk to 4kB, and initialize to zero not only the first 512 Byte, but all of it in drbd_md_sync(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 28 ++++++++++--- drivers/block/drbd/drbd_bitmap.c | 13 +++++- drivers/block/drbd/drbd_int.h | 86 +++++++++++++++++++++++----------------- drivers/block/drbd/drbd_main.c | 11 +++-- drivers/block/drbd/drbd_nl.c | 42 +++++++++++++++----- 5 files changed, 123 insertions(+), 57 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..b230d91ec430 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -209,7 +209,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); + /* we do all our meta data IO in aligned 4k blocks. */ + err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); if (err) { dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); @@ -350,6 +351,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); } +static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) +{ + const unsigned int stripes = 1; + const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; +} + static int _al_write_transaction(struct drbd_conf *mdev) { @@ -432,13 +451,12 @@ _al_write_transaction(struct drbd_conf *mdev) if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle = 0; - sector = mdev->ldev->md.md_offset - + mdev->ldev->md.al_offset - + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); + sector = al_tr_number_to_on_disk_sector(mdev); crc = crc32c(0, buffer, 4096); buffer->crc32c = cpu_to_be32(crc); + /* normal execution path goes through all three branches */ if (drbd_bm_write_hinted(mdev)) err = -EIO; /* drbd_chk_io_error done already */ @@ -446,8 +464,6 @@ _al_write_transaction(struct drbd_conf *mdev) err = -EIO; drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } else { - /* advance ringbuffer position and transaction counter */ - mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); mdev->al_tr_number++; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc08..64fbb8385cdc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) } } +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + /* * make sure the bitmap has enough room for the attached storage, * if necessary, resize. @@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) words = ALIGN(bits, 64) >> LN2_BPL; if (get_ldev(mdev)) { - u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; + u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); put_ldev(mdev); if (bits > bits_on_disk) { dev_info(DEV, "bits = %lu\n", bits); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index db504d021a6e..60c89e5b298c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -753,13 +753,8 @@ struct drbd_md { u32 flags; u32 md_size_sect; - s32 al_offset; /* signed relative sector offset to al area */ + s32 al_offset; /* signed relative sector offset to activity log */ s32 bm_offset; /* signed relative sector offset to bitmap */ - - /* u32 al_nr_extents; important for restoring the AL - * is stored into ldev->dc.al_extents, which in turn - * gets applied to act_log->nr_elements - */ }; struct drbd_backing_dev { @@ -1009,7 +1004,6 @@ struct drbd_conf { struct lru_cache *act_log; /* activity log */ unsigned int al_tr_number; int al_tr_cycle; - int al_tr_pos; /* position of the next transaction in the journal */ wait_queue_head_t seq_wait; atomic_t packet_seq; unsigned int peer_seq; @@ -1151,21 +1145,41 @@ extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout - We reserve a 128MB Block (4k aligned) - * either at the end of the backing device - * or on a separate meta data device. */ + * + * We currently have two possible layouts. + * Offsets in (512 byte) sectors. + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * Variants: + * old, indexed fixed size meta data: + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * [padding*] are zero or up to 7 unused 512 Byte sectors to the + * end of the device, so that the [4k superblock] will be 4k aligned. + * + * The activity log consists of 4k transaction blocks, + * which are written in a ring-buffer, or striped ring-buffer like fashion, + * which are writtensize used to be fixed 32kB, + * but is about to become configurable. + */ -/* The following numbers are sectors */ -/* Allows up to about 3.8TB, so if you want more, +/* Our old fixed size meta data layout + * allows up to about 3.8TB, so if you want more, * you need to use the "flexible" meta data format. */ -#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ -#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ -#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ -#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) - -/* we do all meta data IO in 4k blocks */ -#define MD_BLOCK_SHIFT 12 -#define MD_BLOCK_SIZE (1<md.md_offset + MD_AL_OFFSET - 1; + return bdev->md.md_offset + MD_4kB_SECT -1; case DRBD_MD_INDEX_FLEX_EXT: default: - return bdev->md.md_offset + bdev->md.md_size_sect; + return bdev->md.md_offset + bdev->md.md_size_sect -1; } } @@ -1861,13 +1876,11 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, rcu_read_unlock(); switch (meta_dev_idx) { - default: /* external, some index */ - return MD_RESERVED_SECT * meta_dev_idx; + default: /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * meta_dev_idx; case DRBD_MD_INDEX_INTERNAL: /* with drbd08, internal meta data is always "flexible" */ case DRBD_MD_INDEX_FLEX_INT: - /* sizeof(struct md_on_disk_07) == 4k - * position: last 4k aligned block of 4k size */ if (!bdev->backing_bdev) { if (__ratelimit(&drbd_ratelimit_state)) { dev_err(DEV, "bdev->backing_bdev==NULL\n"); @@ -1875,8 +1888,9 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, } return 0; } - return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - - MD_AL_OFFSET; + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; case DRBD_MD_INDEX_FLEX_EXT: return 0; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 731a28eedc56..76faeab40c8f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2834,6 +2834,7 @@ void conn_md_sync(struct drbd_tconn *tconn) rcu_read_unlock(); } +/* aligned 4kByte */ struct meta_data_on_disk { u64 la_size; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ @@ -2843,13 +2844,13 @@ struct meta_data_on_disk { u32 magic; u32 md_size_sect; u32 al_offset; /* offset to this block */ - u32 al_nr_extents; /* important for restoring the AL */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ - u32 reserved_u32[3]; + u8 reserved_u8[4096 - (7*8 + 8*4)]; } __packed; /** @@ -2862,6 +2863,10 @@ void drbd_md_sync(struct drbd_conf *mdev) sector_t sector; int i; + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + del_timer(&mdev->md_sync_timer); /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) @@ -2876,7 +2881,7 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!buffer) goto out; - memset(buffer, 0, 512); + memset(buffer, 0, sizeof(*buffer)); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 2af26fc95280..581f6800cc30 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -696,12 +696,32 @@ out: return 0; } -/* initializes the md.*_offset members, so we are able to find - * the on disk meta data */ +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. + * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; + unsigned int al_size_sect = MD_32kB_SECT; int meta_dev_idx; rcu_read_lock(); @@ -710,23 +730,23 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, switch (meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ - bdev->md.md_size_sect = MD_RESERVED_SECT; + bdev->md.md_size_sect = MD_128MB_SECT; bdev->md.md_offset = drbd_md_ss__(mdev, bdev); - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_FLEX_EXT: /* just occupy the full device; unit: sectors */ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); bdev->md.md_offset = 0; - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ - bdev->md.al_offset = -MD_AL_SECTORS; + bdev->md.al_offset = -al_size_sect; /* we need (slightly less than) ~ this much bitmap sectors: */ md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); @@ -735,11 +755,11 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, /* plus the "drbd meta data super block", * and the activity log; */ - md_size_sect += MD_BM_OFFSET; + md_size_sect += MD_4kB_SECT + al_size_sect; bdev->md.md_size_sect = md_size_sect; /* bitmap offset is adjusted by 'super' block size */ - bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; break; } rcu_read_unlock(); @@ -1416,7 +1436,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) min_md_device_sectors = (2<<10); } else { max_possible_sectors = DRBD_MAX_SECTORS; - min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { -- cgit v1.2.3-59-g8ed1b From 3a4d4eb3cb03fbc66696fc8cd472701d56f3aee7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:44 +0100 Subject: drbd: prepare for new striped layout of activity log Introduce two new on-disk meta data fields: al_stripes and al_stripe_size_4k The intended use case is activity log on RAID 0 or similar. Logically consecutive transactions will advance their on-disk position by al_stripe_size_4k 4kB (transaction sized) blocks. Right now, these are still asserted to be the backward compatible values al_stripes = 1, al_stripe_size_4k = 8 (which amounts to 32kB). Also introduce a caching member for meta_dev_idx in the in-core structure: even though it is initially passed in in the rcu-protected disk_conf structure, it cannot change without a detach/attach cycle. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 6 ++-- drivers/block/drbd/drbd_int.h | 46 +++++++++++------------- drivers/block/drbd/drbd_main.c | 77 +++++++++++++++++++++++++++++++++++----- drivers/block/drbd/drbd_nl.c | 5 ++- 4 files changed, 94 insertions(+), 40 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b230d91ec430..7e7680e8da6c 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -353,11 +353,11 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) { - const unsigned int stripes = 1; - const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT; + const unsigned int stripes = mdev->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes); + unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); /* ... to aligned 4k on disk block */ t = ((t % stripes) * stripe_size_4kB) + t/stripes; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 60c89e5b298c..ee19ba28b59a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -755,6 +755,14 @@ struct drbd_md { s32 al_offset; /* signed relative sector offset to activity log */ s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ }; struct drbd_backing_dev { @@ -1862,38 +1870,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) } /** - * drbd_md_ss__() - Return the sector number of our meta data super block - * @mdev: DRBD device. + * drbd_md_ss() - Return the sector number of our meta data super block * @bdev: Meta data block device. */ -static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) { - int meta_dev_idx; + const int meta_dev_idx = bdev->md.meta_dev_idx; - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) + return 0; - switch (meta_dev_idx) { - default: /* external, some index; this is the old fixed size layout */ - return MD_128MB_SECT * meta_dev_idx; - case DRBD_MD_INDEX_INTERNAL: - /* with drbd08, internal meta data is always "flexible" */ - case DRBD_MD_INDEX_FLEX_INT: - if (!bdev->backing_bdev) { - if (__ratelimit(&drbd_ratelimit_state)) { - dev_err(DEV, "bdev->backing_bdev==NULL\n"); - dump_stack(); - } - return 0; - } - /* sizeof(struct md_on_disk_07) == 4k - * position: last 4k aligned block of 4k size */ + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; - case DRBD_MD_INDEX_FLEX_EXT: - return 0; - } + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; } static inline void diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 76faeab40c8f..7a2e07b45ecf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2850,7 +2850,11 @@ struct meta_data_on_disk { u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ - u8 reserved_u8[4096 - (7*8 + 8*4)]; + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; } __packed; /** @@ -2898,7 +2902,10 @@ void drbd_md_sync(struct drbd_conf *mdev) buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); - D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); + buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k); + + D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { @@ -2916,13 +2923,60 @@ out: put_ldev(mdev); } +static int check_activity_log_stripe_size(struct drbd_conf *mdev, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + /** * drbd_md_read() - Reads in the meta data super block * @mdev: DRBD device. * @bdev: Device from which the meta data should be read in. * - * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case + * Return NO_ERROR on success, and an enum drbd_ret_code in case * something goes wrong. + * + * Called exactly once during drbd_adm_attach() */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { @@ -2937,6 +2991,10 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!buffer) goto out; + /* First, figure out where our meta data superblock is located. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is called BEFORE disk is attached */ @@ -2954,40 +3012,43 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = ERR_MD_UNCLEAN; goto err; } + + rv = ERR_MD_INVALID; if (magic != DRBD_MD_MAGIC_08) { if (magic == DRBD_MD_MAGIC_07) dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); else dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); - rv = ERR_MD_INVALID; goto err; } + + if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) + goto err; + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", be32_to_cpu(buffer->al_offset), bdev->md.al_offset); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { dev_err(DEV, "unexpected md_size: %u (expected %u)\n", be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); - rv = ERR_MD_INVALID; goto err; } + rv = NO_ERROR; + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 581f6800cc30..104b7cea691e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -727,24 +727,23 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, rcu_read_lock(); meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + switch (meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ bdev->md.md_size_sect = MD_128MB_SECT; - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); bdev->md.al_offset = MD_4kB_SECT; bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_FLEX_EXT: /* just occupy the full device; unit: sectors */ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); - bdev->md.md_offset = 0; bdev->md.al_offset = MD_4kB_SECT; bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ bdev->md.al_offset = -al_size_sect; /* we need (slightly less than) ~ this much bitmap sectors: */ -- cgit v1.2.3-59-g8ed1b From 68e41a43f18b681f83329c8ad83123571bb8db0d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:45 +0100 Subject: drbd: use the cached meta_dev_idx Now we have the cached meta_dev_idx member, we can get rid of a few rcu_read_lock() sections and rcu_dereference(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 32 +++++--------------------------- drivers/block/drbd/drbd_nl.c | 7 +------ 2 files changed, 6 insertions(+), 33 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ee19ba28b59a..6eecdec9da2b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1777,9 +1777,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node. */ -static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) { - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; @@ -1789,30 +1789,13 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi } } -static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) -{ - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - return _drbd_md_first_sector(meta_dev_idx, bdev); -} - /** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device. */ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) { - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + MD_4kB_SECT -1; @@ -1840,18 +1823,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { sector_t s; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: s = drbd_get_capacity(bdev->backing_bdev) ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, - _drbd_md_first_sector(meta_dev_idx, bdev)) + drbd_md_first_sector(bdev)) : 0; break; case DRBD_MD_INDEX_FLEX_EXT: diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 104b7cea691e..5621df86967a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -722,14 +722,10 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, { sector_t md_size_sect = 0; unsigned int al_size_sect = MD_32kB_SECT; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ bdev->md.md_size_sect = MD_128MB_SECT; @@ -761,7 +757,6 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; break; } - rcu_read_unlock(); } /* input size is expected to be in KB */ -- cgit v1.2.3-59-g8ed1b From cccac9857d624dab74b23bafe0482fcdd91df7d8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:46 +0100 Subject: drbd: mechanically rename la_size to la_size_sect Make it obvious that this value is in units of 512 Byte sectors. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 6 +++--- drivers/block/drbd/drbd_nl.c | 16 ++++++++-------- drivers/block/drbd/drbd_receiver.c | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7a2e07b45ecf..6b956fc04dc8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2836,7 +2836,7 @@ void conn_md_sync(struct drbd_tconn *tconn) /* aligned 4kByte */ struct meta_data_on_disk { - u64 la_size; /* last agreed size. */ + u64 la_size_sect; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ u64 device_uuid; u64 reserved_u64_1; @@ -2887,7 +2887,7 @@ void drbd_md_sync(struct drbd_conf *mdev) memset(buffer, 0, sizeof(*buffer)); - buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); buffer->flags = cpu_to_be32(mdev->ldev->md.flags); @@ -3049,7 +3049,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = NO_ERROR; - bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); bdev->md.flags = be32_to_cpu(buffer->flags); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 5621df86967a..d5211b06df45 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -819,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev) enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size, u_size; + sector_t la_size_sect, u_size; sector_t size; char ppb[10]; @@ -842,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds prev_first_sect = drbd_md_first_sector(mdev->ldev); prev_size = mdev->ldev->md.md_size_sect; - la_size = mdev->ldev->md.la_size_sect; + la_size_sect = mdev->ldev->md.la_size_sect; /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); @@ -878,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds if (rv == dev_size_error) goto out; - la_size_changed = (la_size != mdev->ldev->md.la_size_sect); + la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) || prev_size != mdev->ldev->md.md_size_sect; @@ -900,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds drbd_md_mark_dirty(mdev); } - if (size > la_size) + if (size > la_size_sect) rv = grew; - if (size < la_size) + if (size < la_size_sect) rv = shrunk; out: lc_unlock(mdev->act_log); @@ -917,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t u_size, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. */ - sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ sector_t m_size; /* my size */ sector_t size = 0; @@ -931,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, if (p_size && m_size) { size = min_t(sector_t, p_size, m_size); } else { - if (la_size) { - size = la_size; + if (la_size_sect) { + size = la_size_sect; if (m_size && m_size < size) size = m_size; if (p_size && p_size < size) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a9eccfc6079b..8172a2cfdead 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3992,7 +3992,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) clear_bit(DISCARD_MY_DATA, &mdev->flags); - drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ + drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */ return 0; } -- cgit v1.2.3-59-g8ed1b From c04ccaa669e147ffb66e4e74d82c7dbfc100ec5e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:47 +0100 Subject: drbd: read meta data early, base on-disk offsets on super block We used to calculate all on-disk meta data offsets, and then compare the stored offsets, basically treating them as magic numbers. Now with the activity log striping, the activity log size is no longer fixed. We need to first read the super block, then base the activity log and bitmap offsets on the stored offsets/al stripe settings. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 11 +++- drivers/block/drbd/drbd_main.c | 131 ++++++++++++++++++++++++++++++++------- drivers/block/drbd/drbd_nl.c | 15 ++--- drivers/block/drbd/drbd_worker.c | 3 +- 4 files changed, 123 insertions(+), 37 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 7e7680e8da6c..c79625aa8cf2 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -168,7 +168,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(mdev, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_io_complete() */ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); err = -ENODEV; goto out; @@ -199,9 +203,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, BUG_ON(!bdev->md_bdev); - dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", + dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", + (void*)_RET_IP_ ); if (sector < drbd_md_first_sector(bdev) || sector + 7 > drbd_md_last_sector(bdev)) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6b956fc04dc8..e55271d6e7f6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2968,6 +2968,86 @@ err: return -EINVAL; } +static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. */ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + dev_err(DEV, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + /** * drbd_md_read() - Reads in the meta data super block * @mdev: DRBD device. @@ -2976,7 +3056,8 @@ err: * Return NO_ERROR on success, and an enum drbd_ret_code in case * something goes wrong. * - * Called exactly once during drbd_adm_attach() + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @mdev->ldev. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { @@ -2984,14 +3065,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) u32 magic, flags; int i, rv = NO_ERROR; - if (!get_ldev_if_state(mdev, D_ATTACHING)) - return ERR_IO_MD_DISK; + if (mdev->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; buffer = drbd_md_get_buffer(mdev); if (!buffer) - goto out; + return ERR_NOMEM; - /* First, figure out where our meta data superblock is located. */ + /* First, figure out where our meta data superblock is located, + * and read it. */ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); @@ -3022,14 +3104,29 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) goto err; } - if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); goto err; + } - if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { - dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", - be32_to_cpu(buffer->al_offset), bdev->md.al_offset); + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) goto err; - } + if (check_offsets_and_sizes(mdev, bdev)) + goto err; + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); @@ -3041,20 +3138,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) goto err; } - if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { - dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", - be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); - goto err; - } - rv = NO_ERROR; - bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); - for (i = UI_CURRENT; i < UI_SIZE; i++) - bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); - bdev->md.flags = be32_to_cpu(buffer->flags); - bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); - spin_lock_irq(&mdev->tconn->req_lock); if (mdev->state.conn < C_CONNECTED) { unsigned int peer; @@ -3066,8 +3151,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) err: drbd_md_put_buffer(mdev); - out: - put_ldev(mdev); return rv; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d5211b06df45..974ea47a656a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -721,7 +721,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; - unsigned int al_size_sect = MD_32kB_SECT; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; bdev->md.md_offset = drbd_md_ss(bdev); @@ -1413,8 +1413,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ - drbd_md_set_sector_offsets(mdev, nbc); + /* Read our meta data super block early. + * This also sets other on-disk offsets. */ + retcode = drbd_md_read(mdev, nbc); + if (retcode != NO_ERROR) + goto fail; if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", @@ -1481,8 +1484,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; - drbd_md_set_sector_offsets(mdev, nbc); - if (!mdev->bitmap) { if (drbd_bm_init(mdev)) { retcode = ERR_NOMEM; @@ -1490,10 +1491,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } } - retcode = drbd_md_read(mdev, nbc); - if (retcode != NO_ERROR) - goto force_diskless_dec; - if (mdev->state.conn < C_CONNECTED && mdev->state.role == R_PRIMARY && (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b7..34b5d5d23ac4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->done = 1; wake_up(&mdev->misc_wait); bio_put(bio); - put_ldev(mdev); + if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(mdev); } /* reads on behalf of the partner, -- cgit v1.2.3-59-g8ed1b From 56392d2f40aac4b520fc50bc356f40e07f7e1c7d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:48 +0100 Subject: drbd: Clarify when activity log I/O is delegated to the worker thread Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 49 +++++++++++++++++++++----------------- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- 5 files changed, 31 insertions(+), 26 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index c79625aa8cf2..82199d9a9a61 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -104,7 +104,7 @@ struct update_al_work { int err; }; -static int al_write_transaction(struct drbd_conf *mdev); +static int al_write_transaction(struct drbd_conf *mdev, bool delegate); void *drbd_md_get_buffer(struct drbd_conf *mdev) { @@ -246,7 +246,10 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) return al_ext; } -void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) { /* for bios crossing activity log extent boundaries, * we may need to activate two extents in one go */ @@ -255,6 +258,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) unsigned enr; bool locked = false; + /* When called through generic_make_request(), we must delegate + * activity log I/O to the worker thread: a further request + * submitted via generic_make_request() within the same task + * would be queued on current->bio_list, and would only start + * after this function returns (see generic_make_request()). + * + * However, if we *are* the worker, we must not delegate to ourselves. + */ + + if (delegate) + BUG_ON(current == mdev->tconn->worker.task); D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); @@ -270,13 +284,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) (locked = lc_try_lock_for_transaction(mdev->act_log))); if (locked) { - /* drbd_al_write_transaction(mdev,al_ext,enr); - * recurses into generic_make_request(), which - * disallows recursion, bios being serialized on the - * current->bio_tail list now. - * we have to delegate updates to the activity log - * to the worker thread. */ - /* Double check: it may have been committed by someone else, * while we have been waiting for the lock. */ if (mdev->act_log->pending_changes) { @@ -287,7 +294,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) rcu_read_unlock(); if (write_al_updates) { - al_write_transaction(mdev); + al_write_transaction(mdev, delegate); mdev->al_writ_cnt++; } @@ -495,20 +502,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) /* Calls from worker context (see w_restart_disk_io()) need to write the transaction directly. Others came through generic_make_request(), those need to delegate it to the worker. */ -static int al_write_transaction(struct drbd_conf *mdev) +static int al_write_transaction(struct drbd_conf *mdev, bool delegate) { - struct update_al_work al_work; - - if (current == mdev->tconn->worker.task) + if (delegate) { + struct update_al_work al_work; + init_completion(&al_work.event); + al_work.w.cb = w_al_write_transaction; + al_work.w.mdev = mdev; + drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); + wait_for_completion(&al_work.event); + return al_work.err; + } else return _al_write_transaction(mdev); - - init_completion(&al_work.event); - al_work.w.cb = w_al_write_transaction; - al_work.w.mdev = mdev; - drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); - wait_for_completion(&al_work.event); - - return al_work.err; } static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6eecdec9da2b..453fccfc440c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1598,7 +1598,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ -extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8172a2cfdead..1921871ca9a8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2265,7 +2265,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); peer_req->flags |= EE_CALL_AL_COMPLETE_IO; peer_req->flags &= ~EE_MAY_SET_IN_SYNC; - drbd_al_begin_io(mdev, &peer_req->i); + drbd_al_begin_io(mdev, &peer_req->i, true); } err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2b8303ad63c9..7d1ff1aaeb71 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1054,7 +1054,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, &req->i, true); } spin_lock_irq(&mdev->tconn->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 34b5d5d23ac4..f41e224caa7c 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1411,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) struct drbd_conf *mdev = w->mdev; if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, &req->i, false); drbd_req_make_private_bio(req, req->master_bio); req->private_bio->bi_bdev = mdev->ldev->backing_bdev; -- cgit v1.2.3-59-g8ed1b From ebfd5d8f715167b886c9401e6b123847187f137b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:49 +0100 Subject: drbd: drbd_al_being_io: short circuit to reduce latency A request hitting an already "hot" extent should proceed right away, even if some other requests need to wait for pending transactions. Without that short-circuit, several simultaneous make_request contexts race for committing the transaction, possibly penalizing the innocent. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 82199d9a9a61..1d7244d2a910 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -256,6 +256,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); unsigned enr; + bool need_transaction = false; bool locked = false; /* When called through generic_make_request(), we must delegate @@ -273,8 +274,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - for (enr = first; enr <= last; enr++) - wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + + /* If *this* request was to an already active extent, + * we're done, even if there are pending changes. */ + if (!need_transaction) + return; /* Serialize multiple transactions. * This uses test_and_set_bit, memory barrier is implicit. -- cgit v1.2.3-59-g8ed1b From 6d9febe237146156947f0da8407c620b5c33c1df Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:50 +0100 Subject: drbd: split __drbd_make_request in before and after drbd_al_begin_io This is in preparation to be able to defer requests that need to wait for an activity log transaction to a submitter workqueue. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 7d1ff1aaeb71..96d5968fc1e4 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -34,14 +34,14 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); /* Update disk stats at start of I/O request */ -static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) +static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req) { - const int rw = bio_data_dir(bio); + const int rw = bio_data_dir(req->master_bio); int cpu; cpu = part_stat_lock(); part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); - part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9); (void) cpu; /* The macro invocations above want the cpu argument, I do not like the compiler warning about cpu only assigned but never used... */ part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -1020,12 +1020,16 @@ drbd_submit_req_private_bio(struct drbd_request *req) bio_endio(bio, -EIO); } -void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +struct drbd_request * +drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { - const int rw = bio_rw(bio); - struct bio_and_error m = { NULL, }; + const int rw = bio_data_dir(bio); struct drbd_request *req; - bool no_remote = false; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -1035,7 +1039,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long * if user cannot handle io errors, that's not our business. */ dev_err(DEV, "could not kmalloc() req\n"); bio_endio(bio, -ENOMEM); - return; + return ERR_PTR(-ENOMEM); } req->start_time = start_time; @@ -1057,6 +1061,15 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long drbd_al_begin_io(mdev, &req->i, true); } + return req; +} + +static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req) +{ + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + spin_lock_irq(&mdev->tconn->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, @@ -1079,7 +1092,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long } /* Update disk stats */ - _drbd_start_io_acct(mdev, req, bio); + _drbd_start_io_acct(mdev, req); /* We fail READ/READA early, if we can not serve it. * We must do this before req is registered on any lists. @@ -1137,7 +1150,14 @@ out: if (m.bio) complete_master_bio(mdev, &m); - return; +} + +void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +{ + struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(mdev, req); } void drbd_make_request(struct request_queue *q, struct bio *bio) -- cgit v1.2.3-59-g8ed1b From 113fef9e20e0d614b3f5940b67c96e719c559eea Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 18:14:40 -0600 Subject: drbd: prepare to queue write requests on a submit worker Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 13 +++++++++++++ drivers/block/drbd/drbd_main.c | 26 +++++++++++++++++++++++++- drivers/block/drbd/drbd_nl.c | 1 + drivers/block/drbd/drbd_req.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 453fccfc440c..a6b71b6076b5 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -894,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */ } send; }; +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +}; + struct drbd_conf { struct drbd_tconn *tconn; int vnr; /* volume number within the connection */ @@ -1034,6 +1042,10 @@ struct drbd_conf { atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ unsigned int peer_max_bio_size; unsigned int local_max_bio_size; + + /* any requests that would block in drbd_make_request() + * are deferred to this single-threaded work queue */ + struct submit_worker submit; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1440,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn); extern int proc_details; /* drbd_req */ +extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e55271d6e7f6..a150b59897a0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -45,7 +45,7 @@ #include #include #include - +#include #define __KERNEL_SYSCALLS__ #include #include @@ -2300,6 +2300,7 @@ static void drbd_cleanup(void) idr_for_each_entry(&minors, mdev, i) { idr_remove(&minors, mdev_to_minor(mdev)); idr_remove(&mdev->tconn->volumes, mdev->vnr); + destroy_workqueue(mdev->submit.wq); del_gendisk(mdev->vdisk); /* synchronize_rcu(); No other threads running at this point */ kref_put(&mdev->kref, &drbd_minor_destroy); @@ -2589,6 +2590,21 @@ void conn_destroy(struct kref *kref) kfree(tconn); } +int init_submitter(struct drbd_conf *mdev) +{ + /* opencoded create_singlethread_workqueue(), + * to be able to say "drbd%d", ..., minor */ + mdev->submit.wq = alloc_workqueue("drbd%u_submit", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor); + if (!mdev->submit.wq) + return -ENOMEM; + + INIT_WORK(&mdev->submit.worker, do_submit); + spin_lock_init(&mdev->submit.lock); + INIT_LIST_HEAD(&mdev->submit.writes); + return 0; +} + enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) { struct drbd_conf *mdev; @@ -2678,6 +2694,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, goto out_idr_remove_minor; } + if (init_submitter(mdev)) { + err = ERR_NOMEM; + drbd_msg_put_info("unable to create submit workqueue"); + goto out_idr_remove_vol; + } + add_disk(disk); kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ @@ -2688,6 +2710,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, return NO_ERROR; +out_idr_remove_vol: + idr_remove(&tconn->volumes, vnr_got); out_idr_remove_minor: idr_remove(&minors, minor_got); synchronize_rcu(); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 974ea47a656a..bcf900bcd142 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3173,6 +3173,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) CS_VERBOSE + CS_WAIT_COMPLETE); idr_remove(&mdev->tconn->volumes, mdev->vnr); idr_remove(&minors, mdev_to_minor(mdev)); + destroy_workqueue(mdev->submit.wq); del_gendisk(mdev->vdisk); synchronize_rcu(); kref_put(&mdev->kref, &drbd_minor_destroy); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 96d5968fc1e4..4af709e0aae5 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1160,6 +1160,35 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long drbd_send_and_submit(mdev, req); } +void __drbd_make_request_from_worker(struct drbd_conf *mdev, struct drbd_request *req) +{ + const int rw = bio_rw(req->master_bio); + + if (rw == WRITE && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { + drbd_al_begin_io(mdev, &req->i, false); + req->rq_state |= RQ_IN_ACT_LOG; + } + drbd_send_and_submit(mdev, req); +} + + +void do_submit(struct work_struct *ws) +{ + struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker); + LIST_HEAD(writes); + struct drbd_request *req, *tmp; + + spin_lock(&mdev->submit.lock); + list_splice_init(&mdev->submit.writes, &writes); + spin_unlock(&mdev->submit.lock); + + list_for_each_entry_safe(req, tmp, &writes, tl_requests) { + list_del_init(&req->tl_requests); + __drbd_make_request_from_worker(mdev, req); + } +} + void drbd_make_request(struct request_queue *q, struct bio *bio) { struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; -- cgit v1.2.3-59-g8ed1b From b5bc8e08641805391f2c7834c40d0f647e8563c6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:52 +0100 Subject: drbd: split drbd_al_begin_io into fastpath, prepare, and commit Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 104 ++++++++++++++++++++++++++------------- drivers/block/drbd/drbd_int.h | 1 + 2 files changed, 72 insertions(+), 33 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 1d7244d2a910..e4f1231c2ef2 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -104,7 +104,6 @@ struct update_al_work { int err; }; -static int al_write_transaction(struct drbd_conf *mdev, bool delegate); void *drbd_md_get_buffer(struct drbd_conf *mdev) { @@ -246,30 +245,37 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) return al_ext; } -/* - * @delegate: delegate activity log I/O to the worker thread - */ -void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) +bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, * we may need to activate two extents in one go */ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - unsigned enr; - bool need_transaction = false; - bool locked = false; + bool fastpath_ok = true; - /* When called through generic_make_request(), we must delegate - * activity log I/O to the worker thread: a further request - * submitted via generic_make_request() within the same task - * would be queued on current->bio_list, and would only start - * after this function returns (see generic_make_request()). - * - * However, if we *are* the worker, we must not delegate to ourselves. - */ + D_ASSERT((unsigned)(last - first) <= 1); + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); + + /* FIXME figure out a fast path for bios crossing AL extent boundaries */ + if (first != last) + return false; + + spin_lock_irq(&mdev->al_lock); + fastpath_ok = + lc_find(mdev->resync, first/AL_EXT_PER_BM_SECT) == NULL && + lc_try_get(mdev->act_log, first) != NULL; + spin_unlock_irq(&mdev->al_lock); + return fastpath_ok; +} - if (delegate) - BUG_ON(current == mdev->tconn->worker.task); +bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool need_transaction = false; D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); @@ -280,11 +286,28 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele if (al_ext->lc_number != enr) need_transaction = true; } + return need_transaction; +} - /* If *this* request was to an already active extent, - * we're done, even if there are pending changes. */ - if (!need_transaction) - return; +static int al_write_transaction(struct drbd_conf *mdev, bool delegate); + +/* When called through generic_make_request(), we must delegate + * activity log I/O to the worker thread: a further request + * submitted via generic_make_request() within the same task + * would be queued on current->bio_list, and would only start + * after this function returns (see generic_make_request()). + * + * However, if we *are* the worker, we must not delegate to ourselves. + */ + +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate) +{ + bool locked = false; + + BUG_ON(delegate && current == mdev->tconn->worker.task); /* Serialize multiple transactions. * This uses test_and_set_bit, memory barrier is implicit. @@ -303,11 +326,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; rcu_read_unlock(); - if (write_al_updates) { + if (write_al_updates) al_write_transaction(mdev, delegate); - mdev->al_writ_cnt++; - } - spin_lock_irq(&mdev->al_lock); /* FIXME if (err) @@ -321,6 +341,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele } } +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) +{ + BUG_ON(delegate && current == mdev->tconn->worker.task); + + if (drbd_al_begin_io_prepare(mdev, i)) + drbd_al_begin_io_commit(mdev, delegate); +} + void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -478,15 +509,22 @@ _al_write_transaction(struct drbd_conf *mdev) crc = crc32c(0, buffer, 4096); buffer->crc32c = cpu_to_be32(crc); - /* normal execution path goes through all three branches */ if (drbd_bm_write_hinted(mdev)) err = -EIO; - /* drbd_chk_io_error done already */ - else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); - } else { - mdev->al_tr_number++; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); + } else { + mdev->al_tr_number++; + mdev->al_writ_cnt++; + } + } } drbd_md_put_buffer(mdev); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index a6b71b6076b5..b7b52dd42325 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1611,6 +1611,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); -- cgit v1.2.3-59-g8ed1b From 6c3c4355d6bfa418db828684e67910c559402264 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:53 +0100 Subject: drbd: split out some helper functions to drbd_al_begin_io To make the code easier to follow, use an explicit find_active_resync_extent(), and add a "nonblock" parameter to _al_get(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 49 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e4f1231c2ef2..ff03f9053316 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -222,25 +222,37 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, return err; } -static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) +static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr) { - struct lc_element *al_ext; struct lc_element *tmp; - int wake; - - spin_lock_irq(&mdev->al_lock); tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); if (unlikely(tmp != NULL)) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { - wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); - spin_unlock_irq(&mdev->al_lock); - if (wake) - wake_up(&mdev->al_wait); - return NULL; - } + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) + return bm_ext; } - al_ext = lc_get(mdev->act_log, enr); + return NULL; +} + +static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + int wake; + + spin_lock_irq(&mdev->al_lock); + bm_ext = find_active_resync_extent(mdev, enr); + if (bm_ext) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); + spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); + return NULL; + } + if (nonblock) + al_ext = lc_try_get(mdev->act_log, enr); + else + al_ext = lc_get(mdev->act_log, enr); spin_unlock_irq(&mdev->al_lock); return al_ext; } @@ -251,7 +263,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) * we may need to activate two extents in one go */ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - bool fastpath_ok = true; D_ASSERT((unsigned)(last - first) <= 1); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); @@ -260,12 +271,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) if (first != last) return false; - spin_lock_irq(&mdev->al_lock); - fastpath_ok = - lc_find(mdev->resync, first/AL_EXT_PER_BM_SECT) == NULL && - lc_try_get(mdev->act_log, first) != NULL; - spin_unlock_irq(&mdev->al_lock); - return fastpath_ok; + return _al_get(mdev, first, true); } bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) @@ -282,7 +288,8 @@ bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) for (enr = first; enr <= last; enr++) { struct lc_element *al_ext; - wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)) != NULL); + wait_event(mdev->al_wait, + (al_ext = _al_get(mdev, enr, false)) != NULL); if (al_ext->lc_number != enr) need_transaction = true; } -- cgit v1.2.3-59-g8ed1b From 779b3fe4c0e9dea19ae3ddef0b5fd1a663b63ee6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:54 +0100 Subject: drbd: queue writes on submitter thread, unless they pass the activity log fastpath Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4af709e0aae5..43bc1d064bc7 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1020,6 +1020,14 @@ drbd_submit_req_private_bio(struct drbd_request *req) bio_endio(bio, -EIO); } +static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req) +{ + spin_lock(&mdev->submit.lock); + list_add_tail(&req->tl_requests, &mdev->submit.writes); + spin_unlock(&mdev->submit.lock); + queue_work(mdev->submit.wq, &mdev->submit.worker); +} + /* returns the new drbd_request pointer, if the caller is expected to * drbd_send_and_submit() it (to save latency), or NULL if we queued the * request on the submitter thread. @@ -1048,17 +1056,13 @@ drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long star req->private_bio = NULL; } - /* For WRITES going to the local disk, grab a reference on the target - * extent. This waits for any resync activity in the corresponding - * resync extent to finish, and, if necessary, pulls in the target - * extent into the activity log, which involves further disk io because - * of transactional on-disk meta data updates. - * Empty flushes don't need to go into the activity log, they can only - * flush data for pending writes which are already in there. */ if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &mdev->flags)) { + if (!drbd_al_begin_io_fastpath(mdev, &req->i)) { + drbd_queue_write(mdev, req); + return NULL; + } req->rq_state |= RQ_IN_ACT_LOG; - drbd_al_begin_io(mdev, &req->i, true); } return req; -- cgit v1.2.3-59-g8ed1b From 08a1ddab6df7d3c7b6341774cb1cf4b21b96a214 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:56 +0100 Subject: drbd: consolidate as many updates as possible into one AL transaction Depending on current IO depth, try to consolidate as many updates as possible into one activity log transaction. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 49 ++++++++++++++++++++++++++++ drivers/block/drbd/drbd_int.h | 2 ++ drivers/block/drbd/drbd_req.c | 70 ++++++++++++++++++++++++++++++++-------- 3 files changed, 107 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index ff03f9053316..6afe173d5c2b 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -359,6 +359,55 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele drbd_al_begin_io_commit(mdev, delegate); } +int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) +{ + struct lru_cache *al = mdev->act_log; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned nr_al_extents; + unsigned available_update_slots; + unsigned enr; + + D_ASSERT(first <= last); + + nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ + available_update_slots = min(al->nr_elements - al->used, + al->max_pending_changes - al->pending_changes); + + /* We want all necessary updates for a given request within the same transaction + * We could first check how many updates are *actually* needed, + * and use that instead of the worst-case nr_al_extents */ + if (available_update_slots < nr_al_extents) + return -EWOULDBLOCK; + + /* Is resync active in this area? */ + for (enr = first; enr <= last; enr++) { + struct lc_element *tmp; + tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)); + return -EBUSY; + return -EWOULDBLOCK; + } + } + } + + /* Checkout the refcounts. + * Given that we checked for available elements and update slots above, + * this has to be successful. */ + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + al_ext = lc_get_cumulative(mdev->act_log, enr); + if (!al_ext) + dev_info(DEV, "LOGIC BUG for enr=%u\n", enr); + } + return 0; +} + void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b7b52dd42325..f943aacfdad8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1611,6 +1611,8 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate); extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 43bc1d064bc7..b923d41678e1 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1164,32 +1164,74 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long drbd_send_and_submit(mdev, req); } -void __drbd_make_request_from_worker(struct drbd_conf *mdev, struct drbd_request *req) +static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming) { - const int rw = bio_rw(req->master_bio); + struct drbd_request *req, *tmp; + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + const int rw = bio_data_dir(req->master_bio); - if (rw == WRITE && req->private_bio && req->i.size - && !test_bit(AL_SUSPENDED, &mdev->flags)) { - drbd_al_begin_io(mdev, &req->i, false); - req->rq_state |= RQ_IN_ACT_LOG; + if (rw == WRITE /* rw != WRITE should not even end up here! */ + && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { + if (!drbd_al_begin_io_fastpath(mdev, &req->i)) + continue; + + req->rq_state |= RQ_IN_ACT_LOG; + } + + list_del_init(&req->tl_requests); + drbd_send_and_submit(mdev, req); } - drbd_send_and_submit(mdev, req); } +static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev, + struct list_head *incoming, + struct list_head *pending) +{ + struct drbd_request *req, *tmp; + int wake = 0; + int err; + + spin_lock_irq(&mdev->al_lock); + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + err = drbd_al_begin_io_nonblock(mdev, &req->i); + if (err == -EBUSY) + wake = 1; + if (err) + continue; + req->rq_state |= RQ_IN_ACT_LOG; + list_move_tail(&req->tl_requests, pending); + } + spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); + + return !list_empty(pending); +} void do_submit(struct work_struct *ws) { struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker); - LIST_HEAD(writes); + LIST_HEAD(incoming); + LIST_HEAD(pending); struct drbd_request *req, *tmp; - spin_lock(&mdev->submit.lock); - list_splice_init(&mdev->submit.writes, &writes); - spin_unlock(&mdev->submit.lock); + for (;;) { + spin_lock(&mdev->submit.lock); + list_splice_tail_init(&mdev->submit.writes, &incoming); + spin_unlock(&mdev->submit.lock); - list_for_each_entry_safe(req, tmp, &writes, tl_requests) { - list_del_init(&req->tl_requests); - __drbd_make_request_from_worker(mdev, req); + submit_fast_path(mdev, &incoming); + if (list_empty(&incoming)) + break; + + wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending)); + drbd_al_begin_io_commit(mdev, false); + + list_for_each_entry_safe(req, tmp, &pending, tl_requests) { + list_del_init(&req->tl_requests); + drbd_send_and_submit(mdev, req); + } } } -- cgit v1.2.3-59-g8ed1b From 7e8c288f6cde950a6ca001ec06a32c8c2cf4180e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:57 +0100 Subject: drbd: move start io accounting before activity log transaction The IO accounting of the drbd "queue depth" was misleading. We only started IO accounting once we already wrote the activity log. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index b923d41678e1..d72f2fef1cba 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1056,6 +1056,9 @@ drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long star req->private_bio = NULL; } + /* Update disk stats */ + _drbd_start_io_acct(mdev, req); + if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &mdev->flags)) { if (!drbd_al_begin_io_fastpath(mdev, &req->i)) { @@ -1095,9 +1098,6 @@ static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *re goto out; } - /* Update disk stats */ - _drbd_start_io_acct(mdev, req); - /* We fail READ/READA early, if we can not serve it. * We must do this before req is registered on any lists. * Otherwise, drbd_req_complete() will queue failed READ for retry. */ -- cgit v1.2.3-59-g8ed1b From 45ad07b3ac1e3062188fb760fe71cafb4a100215 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:58 +0100 Subject: drbd: try hard to max out the updates per AL transaction There may have been more incoming requests while we where preparing the current transaction. Try to consolidate more updates into this transaction until we make no more progres. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d72f2fef1cba..9f7ff1cb46ff 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1226,6 +1226,37 @@ void do_submit(struct work_struct *ws) break; wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending)); + /* Maybe more was queued, while we prepared the transaction? + * Try to stuff them into this transaction as well. + * Be strictly non-blocking here, no wait_event, we already + * have something to commit. + * Stop if we don't make any more progres. + */ + for (;;) { + LIST_HEAD(more_pending); + LIST_HEAD(more_incoming); + bool made_progress; + + /* It is ok to look outside the lock, + * it's only an optimization anyways */ + if (list_empty(&mdev->submit.writes)) + break; + + spin_lock(&mdev->submit.lock); + list_splice_tail_init(&mdev->submit.writes, &more_incoming); + spin_unlock(&mdev->submit.lock); + + if (list_empty(&more_incoming)) + break; + + made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending); + + list_splice_tail_init(&more_pending, &pending); + list_splice_tail_init(&more_incoming, &incoming); + + if (!made_progress) + break; + } drbd_al_begin_io_commit(mdev, false); list_for_each_entry_safe(req, tmp, &pending, tl_requests) { -- cgit v1.2.3-59-g8ed1b From 5bbcf5e6abe97485748b51ea0713cc3012b4a8f0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:59 +0100 Subject: drbd: adjust upper limit for activity log extents Now that the on-disk activity-log ring buffer size is adjustable, the maximum active set can become larger, and is now limited by the use of 16bit "labels". This increases the maximum working set from 6433 to 65534 extents, each of which covers an area of 4MiB. Which means that if you use the maximum, you'd have to resync more than 250 GiB after an unclean Primary shutdown. With capable backend storage and replication links, this is entirely feasible. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 47 +++++++++++++++++++++++++++++++++++--------- include/linux/drbd_limits.h | 11 +++++------ 2 files changed, 43 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index bcf900bcd142..42fda4ae2f87 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1141,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info) return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); } -static void enforce_disk_conf_limits(struct disk_conf *dc) +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) { - if (dc->al_extents < DRBD_AL_EXTENTS_MIN) - dc->al_extents = DRBD_AL_EXTENTS_MIN; - if (dc->al_extents > DRBD_AL_EXTENTS_MAX) - dc->al_extents = DRBD_AL_EXTENTS_MAX; + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. + */ + const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; + const unsigned int sufficient_on_disk = + (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) + /AL_CONTEXT_PER_TRANSACTION; - if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) - dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + unsigned int al_size_4k = bdev->md.al_size_4k; + + if (al_size_4k > sufficient_on_disk) + return max_al_nr; + + return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; } int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) @@ -1196,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (!expect(new_disk_conf->resync_rate >= 1)) new_disk_conf->resync_rate = 1; - enforce_disk_conf_limits(new_disk_conf); + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev)) + new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev); + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; if (fifo_size != mdev->rs_plan_s->size) { @@ -1344,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - enforce_disk_conf_limits(new_disk_conf); + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); if (!new_plan) { @@ -1419,6 +1443,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto fail; + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) + new_disk_conf->al_extents = drbd_al_extents_max(nbc); + if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fa19c5f5e64..1fedf2b17cc8 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -126,13 +126,12 @@ #define DRBD_RESYNC_RATE_DEF 250 #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ - /* less than 7 would hit performance unnecessarily. - * 919 slots context information per transaction, - * 32k activity log, 4k transaction size, - * one transaction in flight: - * 919 * 7 = 6433 */ + /* less than 7 would hit performance unnecessarily. */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 6433 + /* we use u16 as "slot number", (u16)~0 is "FREE". + * If you use >= 292 kB on-disk ring buffer, + * this is the maximum you can use: */ +#define DRBD_AL_EXTENTS_MAX 0xfffe #define DRBD_AL_EXTENTS_DEF 1237 #define DRBD_AL_EXTENTS_SCALE '1' -- cgit v1.2.3-59-g8ed1b From 2124469efa6079e6c325165fb1926159356b15c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Mar 2013 15:27:26 -0600 Subject: aoe: get rid of cached bv variable in bufinit() Less error prone if we just kill it, it's only used once anyway. Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 25ef5c014fca..67d216c716da 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -919,16 +919,14 @@ bio_pagedec(struct bio *bio) static void bufinit(struct buf *buf, struct request *rq, struct bio *bio) { - struct bio_vec *bv; - memset(buf, 0, sizeof(*buf)); buf->rq = rq; buf->bio = bio; buf->resid = bio->bi_size; buf->sector = bio->bi_sector; bio_pageinc(bio); - buf->bv = bv = &bio->bi_io_vec[bio->bi_idx]; - buf->bv_resid = bv->bv_len; + buf->bv = &bio->bi_io_vec[bio->bi_idx]; + buf->bv_resid = buf->bv->bv_len; WARN_ON(buf->bv_resid == 0); } -- cgit v1.2.3-59-g8ed1b From 2d56a974f36ffd1d00aa897bd55e28079aa9e5b7 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:34 +0100 Subject: drbd: reset ap_in_flight counter for new connections Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1921871ca9a8..cd172b490a95 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -849,6 +849,7 @@ int drbd_connected(struct drbd_conf *mdev) err = drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + atomic_set(&mdev->ap_in_flight, 0); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return err; } -- cgit v1.2.3-59-g8ed1b From a700471bf335965e7603273fd51034415553246a Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:35 +0100 Subject: drbd: abort start of resync early, if it raced with connection breakage We've seen a spurious full resync, because a connection breakage raced with drbd_start_resync(, C_SYNC_TARGET), and the resulting state change request intended to start the resync ended up looking like a local invalidate. Fix: Double check the state inside the lock, and don't even request that state change, if we had connection or IO problems. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f41e224caa7c..7f51f88b0a80 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1653,7 +1653,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) clear_bit(B_RS_H_DONE, &mdev->flags); write_lock_irq(&global_state_lock); - if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + /* Did some connection breakage or IO error race with us? */ + if (mdev->state.conn < C_CONNECTED + || !get_ldev_if_state(mdev, D_NEGOTIATING)) { write_unlock_irq(&global_state_lock); mutex_unlock(mdev->state_mutex); return; -- cgit v1.2.3-59-g8ed1b From 9376d9f8b97f20df5d30c83713652c3118b31534 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:36 +0100 Subject: drbd: move invalidating the whole bitmap out of after_state ch() To avoid other state change requests, after passing through sanitize_state(), to be mistaken for an invalidate, move the "set all bits as out-of-sync" into the invalidate path. Make invalidate and invalidate-remote behave consistently wrt. current connection state (need either an established replication link, or really be disconnected). Also mention that in the documentation. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 36 ++++++++++++++++++++---------------- drivers/block/drbd/drbd_state.c | 7 ------- 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 42fda4ae2f87..c49bda7918b3 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2448,19 +2448,23 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); - if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); - - while (retcode == SS_NEED_CONNECTION) { - spin_lock_irq(&mdev->tconn->req_lock); - if (mdev->state.conn < C_CONNECTED) - retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->tconn->req_lock); - - if (retcode != SS_NEED_CONNECTION) - break; - - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); + /* If that did not work, try again, + * but log failures this time (implicit CS_VERBOSE). + * + * If we happen to be C_STANDALONE R_SECONDARY, + * just change to D_INCONSISTENT, and set all bits in the bitmap. + * Otherwise, we just fail, to avoid races with the resync handshake. + */ + if (retcode < SS_SUCCESS) { + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { + retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } drbd_resume_io(mdev); @@ -2517,9 +2521,9 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); if (retcode < SS_SUCCESS) { - if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { - /* The peer will get a resync upon connect anyways. - * Just make that into a full resync. */ + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); if (retcode >= SS_SUCCESS) { if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0fe220cfb9e9..3bc686f48b53 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1377,13 +1377,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); - /* We are invalidating our self... */ - if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && - os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) - /* other bitmap operation expected during this phase */ - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, - "set_n_write from invalidate", BM_LOCKED_MASK); - /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { -- cgit v1.2.3-59-g8ed1b From 5c4f13d991e69cb715ddc2b6a9bbecead7b02c9e Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:37 +0100 Subject: drbd: fix effective error returned when refusing an invalidate Since commit drbd: Disallow the peer_disk_state to be D_OUTDATED while connected trying to invalidate a disconnected Primary returned an error code that did not really match the situation: "Refusing to be Outdated while Connected" Insert two more specific conditions into is_valid_state(), changing that to "Need access to UpToDate data", respectively "Need a connection to start verify or resync". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_state.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 3bc686f48b53..22e259f34370 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) mdev->tconn->agreed_pro_version < 88) rv = SS_NOT_SUPPORTED; + else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + ns.pdsk == D_UNKNOWN) + rv = SS_NEED_CONNECTION; + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) rv = SS_CONNECTED_OUTDATES; -- cgit v1.2.3-59-g8ed1b From 0b2dafcd9f8fd38d00398dd3da88225ad1e99726 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:38 +0100 Subject: drbd: drop now useless duplicate state request from invalidate Patch best viewed with git diff --ignore-space-change. Now that we attempt the fallback to local bitmap operation only when disconnected, we can safely drop the extra "silent" state request from both invalidate and invalidate-remote. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 62 ++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 34 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c49bda7918b3..56bafdcd943e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2446,26 +2446,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); drbd_flush_workqueue(mdev); - retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); - - /* If that did not work, try again, - * but log failures this time (implicit CS_VERBOSE). - * - * If we happen to be C_STANDALONE R_SECONDARY, - * just change to D_INCONSISTENT, and set all bits in the bitmap. - * Otherwise, we just fail, to avoid races with the resync handshake. + /* If we happen to be C_STANDALONE R_SECONDARY, just change to + * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, + * try to start a resync handshake as sync target for full sync. */ - if (retcode < SS_SUCCESS) { - if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { - retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); - if (retcode >= SS_SUCCESS) { - if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, - "set_n_write from invalidate", BM_LOCKED_MASK)) - retcode = ERR_IO_MD_DISK; - } - } else - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); - } + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { + retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(mdev); out: @@ -2519,21 +2512,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); drbd_flush_workqueue(mdev); - retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); - if (retcode < SS_SUCCESS) { - if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { - /* The peer will get a resync upon connect anyways. Just make that - into a full resync. */ - retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); - if (retcode >= SS_SUCCESS) { - if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, - "set_n_write from invalidate_peer", - BM_LOCKED_SET_ALLOWED)) - retcode = ERR_IO_MD_DISK; - } - } else - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); - } + /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits + * in the bitmap. Otherwise, try to start a resync handshake + * as sync source for full sync. + */ + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ + retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(mdev); out: -- cgit v1.2.3-59-g8ed1b From bb45185de2e90af63a7bc48855de6f870cc216fc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:39 +0100 Subject: drbd: fix spurious warning about bitmap being locked from detach Introduced in drbd: always write bitmap on detach, the bitmap bulk writeout on detach was indicating it expected exclusive bitmap access. Where I meant to say: expect no more modifications, but testing/counting is still allowed. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a150b59897a0..67d2bb3bb533 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3412,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused) * end up here after a failed attach, before ldev was even assigned. */ if (mdev->bitmap && mdev->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. + */ if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, - "detach", BM_LOCKED_MASK)) { + "detach", BM_LOCKED_TEST_ALLOWED)) { if (test_bit(WAS_READ_ERROR, &mdev->flags)) { drbd_md_set_flag(mdev, MDF_FULL_SYNC); drbd_md_sync(mdev); -- cgit v1.2.3-59-g8ed1b From 2bd5ed5d6713594eb2b4d234d01217d506279c7d Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:40 +0100 Subject: drbd: Fix disconnect to keep the peer disk state if connection breaks during operation The issue was that if the connection broke while we did the gracefull state change to C_DISCONNECTING (C_TEARDOWN), then we returned a success code from the state engine. (SS_CW_NO_NEED) The result of that is that we missed to call the fence-peer script in such a case. Fixed that by introducing a new error code (SS_OUTDATE_WO_CONN). This one should never reach back into user space. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 7 +++++-- drivers/block/drbd/drbd_state.c | 14 +++++++------- drivers/block/drbd/drbd_strings.c | 1 + include/linux/drbd.h | 3 ++- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 56bafdcd943e..39e9a91a8f31 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2198,8 +2198,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for return SS_SUCCESS; case SS_PRIMARY_NOP: /* Our state checking code wants to see the peer outdated. */ - rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, - pdsk, D_OUTDATED), CS_VERBOSE); + rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); + break; case SS_CW_FAILED_BY_PEER: /* The peer probably wants to see us outdated. */ diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 22e259f34370..90c5be2b1d30 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -642,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t && os.conn < C_WF_REPORT_PARAMS) rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + return rv; } @@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) return SS_CW_FAILED_BY_PEER; - rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; - - if (rv == SS_UNKNOWN_ERROR) - rv = conn_is_valid_transition(tconn, mask, val, 0); - - if (rv == SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + rv = conn_is_valid_transition(tconn, mask, val, 0); + if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS) + rv = SS_UNKNOWN_ERROR; /* continue waiting */ return rv; } diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 9a664bd27404..58e08ff2b2ce 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", }; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0c5a18ec322c..316330705fd7 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -319,7 +319,8 @@ enum drbd_state_rv { SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ SS_O_VOL_PEER_PRI = -20, - SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ + SS_OUTDATE_WO_CONN = -21, + SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */ }; /* from drbd_strings.c */ -- cgit v1.2.3-59-g8ed1b From 7074e4a745799d521b17775f6d076d84dc7f8c50 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 27 Mar 2013 14:08:41 +0100 Subject: drbd: only fail empty flushes if no good data is reachable We completed empty flushes (blkdev_issue_flush()) with IO error if we lost the local disk, even if we still have an established replication link to a healthy remote disk. Fix this to only report errors to upper layers, if neither local nor remote data is reachable. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 12 ++++++++---- drivers/block/drbd/drbd_req.h | 8 ++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9f7ff1cb46ff..beefe65764ff 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) else root = &mdev->read_requests; drbd_remove_request_interval(root, req); - } else if (!(s & RQ_POSTPONED)) - D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); + } /* Before we can signal completion to the upper layers, * we may need to close the current transfer log epoch. @@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(req->rq_state & RQ_NET_PENDING); mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); break; + + case QUEUE_AS_DRBD_BARRIER: + start_new_tl_epoch(mdev->tconn); + mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); + break; }; return rv; @@ -975,8 +979,8 @@ static int drbd_process_write_request(struct drbd_request *req) /* The only size==0 bios we expect are empty flushes. */ D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); if (remote) - start_new_tl_epoch(mdev->tconn); - return 0; + _req_mod(req, QUEUE_AS_DRBD_BARRIER); + return remote; } if (!remote && !send_oos) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c08d22964d06..978cb1addc98 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -88,6 +88,14 @@ enum drbd_req_event { QUEUE_FOR_NET_READ, QUEUE_FOR_SEND_OOS, + /* An empty flush is queued as P_BARRIER, + * which will cause it to complete "successfully", + * even if the local disk flush failed. + * + * Just like "real" requests, empty flushes (blkdev_issue_flush()) will + * only see an error if neither local nor remote data is reachable. */ + QUEUE_AS_DRBD_BARRIER, + SEND_CANCELED, SEND_FAILED, HANDED_OVER_TO_NETWORK, -- cgit v1.2.3-59-g8ed1b From 94ad0a101415978be04945b2787be1e8e8a874db Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 27 Mar 2013 14:08:42 +0100 Subject: drbd: fix memory leak We forgot to free the disk_conf, so for each attach/detach cycle we leaked 336 bytes. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 67d2bb3bb533..1b93a7262ef7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2819,6 +2819,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + kfree(ldev->disk_conf); kfree(ldev); } -- cgit v1.2.3-59-g8ed1b From a3f8f7dc7ad652cd84c12cb5efa0f7722dff4786 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 27 Mar 2013 14:08:43 +0100 Subject: drbd: validate resync_after dependency on attach already We validated resync_after dependencies, if changed via disk-options. But we did not validate them when first created via attach. We also did not check or cleanup dependencies that used to be correct, but now point to meanwhile removed minor devices. If the drbd_resync_after_valid() validation in disk-options tried to follow a dependency chain in this way, this could lead to NULL pointer dereference. Validate resync_after settings in drbd_adm_attach() already, as well as in drbd_adm_disk_opts(), and and only reject dependency loops. Depending on non-existing disks is allowed and equivalent to no dependency. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 6 ++++++ drivers/block/drbd/drbd_worker.c | 15 ++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 39e9a91a8f31..9e3f441e7e84 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1381,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); + write_unlock_irq(&global_state_lock); + if (retcode != NO_ERROR) + goto fail; + rcu_read_lock(); nc = rcu_dereference(mdev->tconn->net_conf); if (nc) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 7f51f88b0a80..891c0ecaa292 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1426,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) int resync_after; while (1) { - if (!odev->ldev) + if (!odev->ldev || odev->state.disk == D_DISKLESS) return 1; rcu_read_lock(); resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; @@ -1434,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) if (resync_after == -1) return 1; odev = minor_to_mdev(resync_after); - if (!expect(odev)) + if (!odev) return 1; if ((odev->state.conn >= C_SYNC_SOURCE && odev->state.conn <= C_PAUSED_SYNC_T) || @@ -1516,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) if (o_minor == -1) return NO_ERROR; - if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) + if (o_minor < -1 || o_minor > MINORMASK) return ERR_RESYNC_AFTER; /* check for loops */ @@ -1525,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) if (odev == mdev) return ERR_RESYNC_AFTER_CYCLE; + /* You are free to depend on diskless, non-existing, + * or not yet/no longer existing minors. + * We only reject dependency loops. + * We cannot follow the dependency chain beyond a detached or + * missing minor. + */ + if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) + return NO_ERROR; + rcu_read_lock(); resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; rcu_read_unlock(); -- cgit v1.2.3-59-g8ed1b From ef57f9e6bb9278720c8a5278728f252ab85d7ac6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:44 +0100 Subject: drbd: Fix build error when CONFIG_CRYPTO_HMAC is not set Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index cd172b490a95..7af0cc77aa60 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn) #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) static int drbd_do_auth(struct drbd_tconn *tconn) { - dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); - dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); return -1; } #else -- cgit v1.2.3-59-g8ed1b From 607f25e56ee0a31e451f6bd8a7109fa1f5dcbe29 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 27 Mar 2013 14:08:45 +0100 Subject: drbd: fix drbd epoch write count for ahead/behind mode The sanity check when receiving P_BARRIER_ACK does expect all write requests with a given req->epoch to have been either all replicated, or all not replicated. Because req->epoch was assigned before calling maybe_pull_ahead(), this expectation was not met, leading to an off-by-one in the sanity check, and further to a "Protocol Error". Fix: move the call to maybe_pull_ahead() a few lines up, and assign req->epoch only after that. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index beefe65764ff..c24379ffd4e3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -865,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev) bool congested = false; enum drbd_on_congestion on_congestion; + rcu_read_lock(); nc = rcu_dereference(tconn->net_conf); on_congestion = nc ? nc->on_congestion : OC_BLOCK; + rcu_read_unlock(); if (on_congestion == OC_BLOCK || tconn->agreed_pro_version < 96) return; @@ -960,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req) struct drbd_conf *mdev = req->w.mdev; int remote, send_oos; - rcu_read_lock(); remote = drbd_should_do_remote(mdev->state); - if (remote) { - maybe_pull_ahead(mdev); - remote = drbd_should_do_remote(mdev->state); - } send_oos = drbd_should_send_out_of_sync(mdev->state); - rcu_read_unlock(); /* Need to replicate writes. Unless it is an empty flush, * which is better mapped to a DRBD P_BARRIER packet, @@ -1087,9 +1083,13 @@ static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *re * but will re-aquire it before it returns here. * Needs to be before the check on drbd_suspended() */ complete_conflicting_writes(req); + /* no more giving up req_lock from now on! */ + + /* check for congestion, and potentially stop sending + * full data updates, but start sending "dirty bits" only. */ + maybe_pull_ahead(mdev); } - /* no more giving up req_lock from now on! */ if (drbd_suspended(mdev)) { /* push back and retry: */ -- cgit v1.2.3-59-g8ed1b From 193d01532a730a53cbc74462799dbc43968b97fd Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Wed, 27 Mar 2013 14:08:46 +0100 Subject: drbd: add module_put() on error path in drbd_proc_open() If single_open() fails in drbd_proc_open(), module refcount is left incremented. The patch adds module_put() on the error path. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 56672a61eb94..30fe0a57f5a0 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v) static int drbd_proc_open(struct inode *inode, struct file *file) { - if (try_module_get(THIS_MODULE)) - return single_open(file, drbd_seq_show, PDE(inode)->data); + int err; + + if (try_module_get(THIS_MODULE)) { + err = single_open(file, drbd_seq_show, PDE(inode)->data); + if (err) + module_put(THIS_MODULE); + return err; + } return -ENODEV; } -- cgit v1.2.3-59-g8ed1b From 7c689e63a847316c1b2500f86891b0a574ce7e69 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:47 +0100 Subject: drbd: fix for deadlock when using automatic split-brain-recovery With an automatic after split-brain recovery policy of "after-sb-1pri call-pri-lost-after-sb", when trying to drbd_set_role() to R_SECONDARY, we run into a deadlock. This was first recognized and supposedly fixed by 2009-06-10 "Fixed a deadlock when using automatic split brain recovery when both nodes are" replacing drbd_set_role() with drbd_change_state() in that code-path, but the first hunk of that patch forgets to remove the drbd_set_role(). We apparently only ever tested the "two primaries" case. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 7af0cc77aa60..a75c0b134856 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2662,7 +2662,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) if (hg == -1 && mdev->state.role == R_PRIMARY) { enum drbd_state_rv rv2; - drbd_set_role(mdev, R_SECONDARY, 0); /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, * we might be here in C_WF_REPORT_PARAMS which is transient. * we do not need to wait for the after state change work either. */ -- cgit v1.2.3-59-g8ed1b From 3990e04df085e0561ab34f84731dc5929585c526 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:48 +0100 Subject: drbd: use sched_setscheduler() It was unnoticed for some time that assigning to current->policy is no longer sufficient to set a real time priority for a kernel thread. Reported-by: Charlie Suffin Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 6 ++++-- include/linux/drbd.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a75c0b134856..0f449bbf0edf 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5257,9 +5257,11 @@ int drbd_asender(struct drbd_thread *thi) bool ping_timeout_active = false; struct net_conf *nc; int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; - current->policy = SCHED_RR; /* Make this a realtime task! */ - current->rt_priority = 2; /* more important than all other tasks */ + rv = sched_setscheduler(current, SCHED_RR, ¶m); + if (rv < 0) + conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 316330705fd7..1b4d4ee1168f 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -52,7 +52,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.2" +#define REL_VERSION "8.4.3" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3-59-g8ed1b From 0b6ef4164f50698eee536903d69d086add1a7889 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 27 Mar 2013 14:08:49 +0100 Subject: drbd: fix if(); found by kbuild test robot Recently introduced al_begin_io_nonblock() was returning -EBUSY, even when it should return -EWOULDBLOCK. Impact: A few spurious wake_up() calls in prepare_al_transaction_nonblock(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 6afe173d5c2b..6608076dc39e 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -389,7 +389,7 @@ int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) if (unlikely(tmp != NULL)) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { - if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)); + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) return -EBUSY; return -EWOULDBLOCK; } -- cgit v1.2.3-59-g8ed1b From 5a79e1ac21681907f8cc58e1f78fc40af5bbb890 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Fri, 12 Apr 2013 23:57:17 +0530 Subject: mtip32xx: fix a smatch warning Reported smatch warning: drivers/block/mtip32xx/mtip32xx.c:4163 mtip_block_shutdown() warn: variable dereferenced before check 'dd->disk' (see line 4159) dd->disk->disk_name accessed before the check if dd->disk is NULL. Fixed this and access of dd->queue/dd->disk->queue. Reported-by: Dan Carpenter Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 11cc9522cdd4..2d0d52f6ee2a 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4053,26 +4053,24 @@ static int mtip_block_remove(struct driver_data *dd) */ static int mtip_block_shutdown(struct driver_data *dd) { - dev_info(&dd->pdev->dev, - "Shutting down %s ...\n", dd->disk->disk_name); - /* Delete our gendisk structure, and cleanup the blk queue. */ if (dd->disk) { - if (dd->disk->queue) + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); + + if (dd->disk->queue) { del_gendisk(dd->disk); - else + blk_cleanup_queue(dd->queue); + } else put_disk(dd->disk); + dd->disk = NULL; + dd->queue = NULL; } - spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); spin_unlock(&rssd_index_lock); - blk_cleanup_queue(dd->queue); - dd->disk = NULL; - dd->queue = NULL; - mtip_hw_shutdown(dd); return 0; } -- cgit v1.2.3-59-g8ed1b From 68466cbf92367c0808bfd2c64bc6faddd7a97b6f Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Fri, 12 Apr 2013 23:58:02 +0530 Subject: mtip32xx: mtip32xx: Disable TRIM support Temporarily disabling TRIM support until TRIM related issues are addressed in the firmware. Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 2d0d52f6ee2a..30a03ab7efc9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1520,10 +1520,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } #endif +#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ /* Demux ID.DRAT & ID.RZAT to determine trim support */ if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) port->dd->trim_supp = true; else +#endif port->dd->trim_supp = false; /* Set the identify buffer as valid. */ -- cgit v1.2.3-59-g8ed1b From 2077d947260c620550ba06f76aa4affd0106f4da Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 29 Apr 2013 21:19:49 +0200 Subject: mtip32xx: Workaround for unaligned writes Workaround for handling unaligned writes: limit number of outstanding unaligned writes Signed-off-by: Sam Bradshaw Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 59 +++++++++++++++++++++++++++++++-------- drivers/block/mtip32xx/mtip32xx.h | 11 ++++++++ 2 files changed, 58 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 30a03ab7efc9..c0d38734041c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -712,7 +712,10 @@ static void mtip_async_complete(struct mtip_port *port, atomic_set(&port->commands[tag].active, 0); release_slot(port, tag); - up(&port->cmd_slot); + if (unlikely(command->unaligned)) + up(&port->cmd_slot_unal); + else + up(&port->cmd_slot); } /* @@ -2557,7 +2560,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, int nsect, int nents, int tag, void *callback, - void *data, int dir) + void *data, int dir, int unaligned) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2570,6 +2573,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->scatter_ents = nents; + command->unaligned = unaligned; /* * The number of retries for this command before it is * reported as a failure to the upper layers. @@ -2598,6 +2602,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis->res3 = 0; fill_command_sg(dd, command, nents); + if (unaligned) + fis->device |= 1 << 7; + /* Populate the command header */ command->command_header->opts = __force_bit2int cpu_to_le32( @@ -2644,9 +2651,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, * return value * None */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) +static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, + int unaligned) { + struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : + &dd->port->cmd_slot; release_slot(dd->port, tag); + up(sem); } /* @@ -2661,22 +2672,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) * or NULL if no command slots are available. */ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag) + int *tag, int unaligned) { + struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : + &dd->port->cmd_slot; + /* * It is possible that, even with this semaphore, a thread * may think that no command slots are available. Therefore, we * need to make an attempt to get_slot(). */ - down(&dd->port->cmd_slot); + down(sem); *tag = get_slot(dd->port); if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - up(&dd->port->cmd_slot); + up(sem); return NULL; } if (unlikely(*tag < 0)) { - up(&dd->port->cmd_slot); + up(sem); return NULL; } @@ -2909,6 +2923,11 @@ static inline void hba_setup(struct driver_data *dd) dd->mmio + HOST_HSORG); } +static int mtip_device_unaligned_constrained(struct driver_data *dd) +{ + return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0); +} + /* * Detect the details of the product, and store anything needed * into the driver data structure. This includes product type and @@ -3131,8 +3150,15 @@ static int mtip_hw_init(struct driver_data *dd) for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) dd->work[i].port = dd->port; + /* Enable unaligned IO constraints for some devices */ + if (mtip_device_unaligned_constrained(dd)) + dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS; + else + dd->unal_qdepth = 0; + /* Counting semaphore to track command slot usage */ - sema_init(&dd->port->cmd_slot, num_command_slots - 1); + sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); + sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) @@ -3735,7 +3761,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) struct scatterlist *sg; struct bio_vec *bvec; int nents = 0; - int tag = 0; + int tag = 0, unaligned = 0; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, @@ -3771,7 +3797,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) return; } - sg = mtip_hw_get_scatterlist(dd, &tag); + if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && + dd->unal_qdepth) { + if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */ + unaligned = 1; + else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ + unaligned = 1; + } + + sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); if (likely(sg != NULL)) { blk_queue_bounce(queue, &bio); @@ -3779,7 +3813,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) dev_warn(&dd->pdev->dev, "Maximum number of SGL entries exceeded\n"); bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag); + mtip_hw_release_scatterlist(dd, tag, unaligned); return; } @@ -3799,7 +3833,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) tag, bio_endio, bio, - bio_data_dir(bio)); + bio_data_dir(bio), + unaligned); } else bio_io_error(bio); } diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3bffff5f670c..579fb84bdd95 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -52,6 +52,9 @@ #define MTIP_FTL_REBUILD_MAGIC 0xED51 #define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 +/* unaligned IO handling */ +#define MTIP_MAX_UNALIGNED_SLOTS 8 + /* Macro to extract the tag bit number from a tag value. */ #define MTIP_TAG_BIT(tag) (tag & 0x1F) @@ -333,6 +336,8 @@ struct mtip_cmd { int scatter_ents; /* Number of scatter list entries used */ + int unaligned; /* command is unaligned on 4k boundary */ + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ int retries; /* The number of retries left for this command. */ @@ -452,6 +457,10 @@ struct mtip_port { * command slots available. */ struct semaphore cmd_slot; + + /* Semaphore to control queue depth of unaligned IOs */ + struct semaphore cmd_slot_unal; + /* Spinlock for working around command-issue bug. */ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; }; @@ -501,6 +510,8 @@ struct driver_data { atomic_t irq_workers_active; int isr_binding; + + int unal_qdepth; /* qdepth of unaligned IO queue */ }; #endif -- cgit v1.2.3-59-g8ed1b From aed3d67e57ec3d27d64114ab61dab9bcd69fe42f Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 29 Apr 2013 11:55:51 -0700 Subject: drivers/block/mg_disk.c: add CONFIG_PM_SLEEP to suspend/resume functions Add CONFIG_PM_SLEEP to suspend/resume functions to fix the following build warning when CONFIG_PM_SLEEP is not selected. This is because sleep PM callbacks defined by SIMPLE_DEV_PM_OPS are only used when the CONFIG_PM_SLEEP is enabled. drivers/block/mg_disk.c:783:12: warning: 'mg_suspend' defined but not used [-Wunused-function] drivers/block/mg_disk.c:807:12: warning: 'mg_resume' defined but not used [-Wunused-function] Signed-off-by: Jingoo Han Cc: Jens Axboe Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1788f491e0fb..532bb8935a57 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = { .getgeo = mg_getgeo }; +#ifdef CONFIG_PM_SLEEP static int mg_suspend(struct device *dev) { struct mg_drv_data *prv_data = dev->platform_data; @@ -824,6 +825,7 @@ static int mg_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); -- cgit v1.2.3-59-g8ed1b From e4292e05d4c1bb4e64c299ea4b378a0f66d3cc8a Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Mon, 29 Apr 2013 11:55:53 -0700 Subject: cciss: add cciss_allow_hpsa module parameter Add the cciss_allow_hpsa modules parameter. This allows users to use the hpsa driver instead of cciss for older controllers. Signed-off-by: Mike Miller Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ade58bc8f3c4..2b485ca1150b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cciss_simple_mode, "Use 'simple mode' rather than 'performant mode'"); +static int cciss_allow_hpsa; +module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_allow_hpsa, + "Prevent cciss driver from accessing hardware known to be " + " supported by the hpsa driver"); + static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -4116,9 +4122,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id) *board_id = ((subsystem_device_id << 16) & 0xffff0000) | subsystem_vendor_id; - for (i = 0; i < ARRAY_SIZE(products); i++) + for (i = 0; i < ARRAY_SIZE(products); i++) { + /* Stand aside for hpsa driver on request */ + if (cciss_allow_hpsa) + return -ENODEV; if (*board_id == products[i].board_id) return i; + } dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", *board_id); return -ENODEV; -- cgit v1.2.3-59-g8ed1b From 0821e904057505c7e25d72e1a282105d023b26c9 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Mon, 29 Apr 2013 11:55:54 -0700 Subject: cciss: bug fix to prevent cciss from loading in kdump crash kernel By default the cciss driver supports all "older" HP Smart Array controllers and hpsa supports all controllers starting with the G6 family. There are module parameters that allow a user to override those defaults and use hpsa for any HP Smart Array controller. If the user does override the default behavior and uses hpsa for older controllers it is possible that cciss may try to load in a kdump crash kernel. This may happen if cciss is loaded first from the kdump initrd image. If cciss does load rather than hpsa and reset_devices is true we immediately call cciss_hard_reset_controller. This will result in a kernel panic and the core file cannot be created. This patch prevents cciss from trying to load in this scenario. Signed-off-by: Mike Miller Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 2b485ca1150b..2cd230cfa813 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4970,6 +4970,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) ctlr_info_t *h; unsigned long flags; + /* + * By default the cciss driver is used for all older HP Smart Array + * controllers. There are module paramaters that allow a user to + * override this behavior and instead use the hpsa SCSI driver. If + * this is the case cciss may be loaded first from the kdump initrd + * image and cause a kernel panic. So if reset_devices is true and + * cciss_allow_hpsa is set just bail. + */ + if ((reset_devices) && (cciss_allow_hpsa == 1)) + return -ENODEV; rc = cciss_init_reset_devices(pdev); if (rc) { if (rc != -ENOTSUPP) -- cgit v1.2.3-59-g8ed1b