aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-04 20:00:14 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-04 20:00:14 -0700
commitfa9db655d0e112c108fe838809608caf759bdf5e (patch)
tree899a983b333871688095fd14b413c199b9a38f73
parentMerge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma (diff)
parents390/dasd: Establish DMA alignment (diff)
downloadlinux-dev-fa9db655d0e112c108fe838809608caf759bdf5e.tar.xz
linux-dev-fa9db655d0e112c108fe838809608caf759bdf5e.zip
Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe: - NVMe pull requests via Christoph: - add support for In-Band authentication (Hannes Reinecke) - handle the persistent internal error AER (Michael Kelley) - use in-capsule data for TCP I/O queue connect (Caleb Sander) - remove timeout for getting RDMA-CM established event (Israel Rukshin) - misc cleanups (Joel Granados, Sagi Grimberg, Chaitanya Kulkarni, Guixin Liu, Xiang wangx) - use command_id instead of req->tag in trace_nvme_complete_rq() (Bean Huo) - various fixes for the new authentication code (Lukas Bulwahn, Dan Carpenter, Colin Ian King, Chaitanya Kulkarni, Hannes Reinecke) - small cleanups (Liu Song, Christoph Hellwig) - restore compat_ioctl support (Nick Bowler) - make a nvmet-tcp workqueue lockdep-safe (Sagi Grimberg) - enable generic interface (/dev/ngXnY) for unknown command sets (Joel Granados, Christoph Hellwig) - don't always build constants.o (Christoph Hellwig) - print the command name of aborted commands (Christoph Hellwig) - MD pull requests via Song: - Improve raid5 lock contention, by Logan Gunthorpe. - Misc fixes to raid5, by Logan Gunthorpe. - Fix race condition with md_reap_sync_thread(), by Guoqing Jiang. - Fix potential deadlock with raid5_quiesce and raid5_get_active_stripe, by Logan Gunthorpe. - Refactoring md_alloc(), by Christoph" - Fix md disk_name lifetime problems, by Christoph Hellwig - Convert prepare_to_wait() to wait_woken() api, by Logan Gunthorpe; - Fix sectors_to_do bitmap issue, by Logan Gunthorpe. - Work on unifying the null_blk module parameters and configfs API (Vincent) - drbd bitmap IO error fix (Lars) - Set of rnbd fixes (Guoqing, Md Haris) - Remove experimental marker on bcache async device registration (Coly) - Series from cleaning up the bio splitting (Christoph) - Removal of the sx8 block driver. This hardware never really widespread, and it didn't receive a lot of attention after the initial merge of it back in 2005 (Christoph) - A few fixes for s390 dasd (Eric, Jiang) - Followup set of fixes for ublk (Ming) - Support for UBLK_IO_NEED_GET_DATA for ublk (ZiyangZhang) - Fixes for the dio dma alignment (Keith) - Misc fixes and cleanups (Ming, Yu, Dan, Christophe * tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block: (136 commits) s390/dasd: Establish DMA alignment s390/dasd: drop unexpected word 'for' in comments ublk_drv: add support for UBLK_IO_NEED_GET_DATA ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA ublk_drv: cleanup ublksrv_ctrl_dev_info ublk_drv: add SET_PARAMS/GET_PARAMS control command ublk_drv: fix ublk device leak in case that add_disk fails ublk_drv: cancel device even though disk isn't up block: fix leaking page ref on truncated direct io block: ensure bio_iov_add_page can't fail block: ensure iov_iter advances for added pages drivers:md:fix a potential use-after-free bug md/raid5: Ensure batch_last is released before sleeping for quiesce md/raid5: Move stripe_request_ctx up md/raid5: Drop unnecessary call to r5c_check_stripe_cache_usage() md/raid5: Make is_inactive_blocked() helper md/raid5: Refactor raid5_get_active_stripe() block: pass struct queue_limits to the bio splitting helpers block: move bio_allowed_max_sectors to blk-merge.c block: move the call to get_max_io_size out of blk_bio_segment_split ...
-rw-r--r--Documentation/block/null_blk.rst22
-rw-r--r--MAINTAINERS4
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/bio.c51
-rw-r--r--block/blk-core.c9
-rw-r--r--block/blk-merge.c185
-rw-r--r--block/blk-mq.c6
-rw-r--r--block/blk-sysfs.c2
-rw-r--r--block/blk.h47
-rw-r--r--block/bounce.c26
-rw-r--r--block/genhd.c8
-rw-r--r--crypto/kpp.c6
-rw-r--r--crypto/shash.c6
-rw-r--r--drivers/block/Kconfig9
-rw-r--r--drivers/block/Makefile2
-rw-r--r--drivers/block/drbd/drbd_bitmap.c49
-rw-r--r--drivers/block/drbd/drbd_req.c2
-rw-r--r--drivers/block/nbd.c6
-rw-r--r--drivers/block/null_blk/main.c108
-rw-r--r--drivers/block/null_blk/null_blk.h2
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/ps3vram.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt-sysfs.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt.c201
-rw-r--r--drivers/block/rnbd/rnbd-clt.h18
-rw-r--r--drivers/block/rnbd/rnbd-srv.c20
-rw-r--r--drivers/block/rnbd/rnbd-srv.h4
-rw-r--r--drivers/block/sx8.c1582
-rw-r--r--drivers/block/ublk_drv.c348
-rw-r--r--drivers/md/bcache/Kconfig2
-rw-r--r--drivers/md/dm-raid.c1
-rw-r--r--drivers/md/dm.c8
-rw-r--r--drivers/md/md-autodetect.c21
-rw-r--r--drivers/md/md-cluster.c4
-rw-r--r--drivers/md/md.c424
-rw-r--r--drivers/md/md.h19
-rw-r--r--drivers/md/raid10.c5
-rw-r--r--drivers/md/raid5-cache.c40
-rw-r--r--drivers/md/raid5-log.h77
-rw-r--r--drivers/md/raid5-ppl.c2
-rw-r--r--drivers/md/raid5.c727
-rw-r--r--drivers/md/raid5.h2
-rw-r--r--drivers/nvme/Kconfig1
-rw-r--r--drivers/nvme/Makefile1
-rw-r--r--drivers/nvme/common/Kconfig4
-rw-r--r--drivers/nvme/common/Makefile7
-rw-r--r--drivers/nvme/common/auth.c483
-rw-r--r--drivers/nvme/host/Kconfig15
-rw-r--r--drivers/nvme/host/Makefile4
-rw-r--r--drivers/nvme/host/apple.c28
-rw-r--r--drivers/nvme/host/auth.c1017
-rw-r--r--drivers/nvme/host/constants.c3
-rw-r--r--drivers/nvme/host/core.c490
-rw-r--r--drivers/nvme/host/fabrics.c94
-rw-r--r--drivers/nvme/host/fabrics.h7
-rw-r--r--drivers/nvme/host/multipath.c9
-rw-r--r--drivers/nvme/host/nvme.h39
-rw-r--r--drivers/nvme/host/pci.c145
-rw-r--r--drivers/nvme/host/rdma.c106
-rw-r--r--drivers/nvme/host/tcp.c95
-rw-r--r--drivers/nvme/host/trace.c32
-rw-r--r--drivers/nvme/host/trace.h2
-rw-r--r--drivers/nvme/target/Kconfig15
-rw-r--r--drivers/nvme/target/Makefile1
-rw-r--r--drivers/nvme/target/admin-cmd.c4
-rw-r--r--drivers/nvme/target/auth.c525
-rw-r--r--drivers/nvme/target/configfs.c136
-rw-r--r--drivers/nvme/target/core.c15
-rw-r--r--drivers/nvme/target/fabrics-cmd-auth.c544
-rw-r--r--drivers/nvme/target/fabrics-cmd.c55
-rw-r--r--drivers/nvme/target/loop.c8
-rw-r--r--drivers/nvme/target/nvmet.h75
-rw-r--r--drivers/nvme/target/tcp.c3
-rw-r--r--drivers/s390/block/dasd.c2
-rw-r--r--drivers/s390/block/dasd_diag.c1
-rw-r--r--drivers/s390/block/dasd_eckd.c1
-rw-r--r--drivers/s390/block/dcssblk.c2
-rw-r--r--include/crypto/hash.h2
-rw-r--r--include/crypto/kpp.h2
-rw-r--r--include/linux/base64.h16
-rw-r--r--include/linux/blkdev.h5
-rw-r--r--include/linux/nvme-auth.h41
-rw-r--r--include/linux/nvme.h213
-rw-r--r--include/uapi/linux/ublk_cmd.h80
-rw-r--r--lib/Makefile2
-rw-r--r--lib/base64.c103
86 files changed, 5640 insertions, 2856 deletions
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
index edbbab2f12f8..4dd78f24d10a 100644
--- a/Documentation/block/null_blk.rst
+++ b/Documentation/block/null_blk.rst
@@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1
hw_queue_depth=[0..qdepth]: Default: 64
The hardware queue depth of the device.
+memory_backed=[0/1]: Default: 0
+ Whether or not to use a memory buffer to respond to IO requests
+
+ = =============================================
+ 0 Transfer no data in response to IO requests
+ 1 Use a memory buffer to respond to IO requests
+ = =============================================
+
+discard=[0/1]: Default: 0
+ Support discard operations (requires memory-backed null_blk device).
+
+ = =====================================
+ 0 Do not support discard operations
+ 1 Enable support for discard operations
+ = =====================================
+
+cache_size=[Size in MB]: Default: 0
+ Cache size in MB for memory-backed device.
+
+mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
+ Bandwidth limit for device performance.
+
Multi-queue specific parameters
-------------------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 1c42103d889a..a6c5c29e5a0b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14507,7 +14507,8 @@ S: Supported
W: http://git.infradead.org/nvme.git
T: git://git.infradead.org/nvme.git
F: drivers/nvme/host/
-F: include/linux/nvme.h
+F: drivers/nvme/common/
+F: include/linux/nvme*
F: include/uapi/linux/nvme_ioctl.h
NVM EXPRESS FC TRANSPORT DRIVERS
@@ -18838,6 +18839,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT
M: Song Liu <song@kernel.org>
L: linux-raid@vger.kernel.org
S: Supported
+Q: https://patchwork.kernel.org/project/linux-raid/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
F: drivers/md/Kconfig
F: drivers/md/Makefile
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 32929c89ba8a..3f5685c00e36 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
- bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+ bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
diff --git a/block/bio.c b/block/bio.c
index 6f9f883f9a65..d6eb90d9b20b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -965,7 +965,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
* would create a gap, disallow it.
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bvec_gap_to_prev(q, bvec, offset))
+ if (bvec_gap_to_prev(&q->limits, bvec, offset))
return 0;
}
@@ -1151,22 +1151,12 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
- size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
- for (i = 0; i < nr; i++)
- put_page(pages[i]);
-}
-
static int bio_iov_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
bool same_page = false;
if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (WARN_ON_ONCE(bio_full(bio, len)))
- return -EINVAL;
__bio_add_page(bio, page, len, offset);
return 0;
}
@@ -1209,8 +1199,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
ssize_t size, left;
- unsigned len, i;
+ unsigned len, i = 0;
size_t offset;
+ int ret = 0;
/*
* Move page array up in the allocated memory for the bio vecs as far as
@@ -1227,32 +1218,40 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
*/
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (size > 0)
+ size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
+ nr_pages, &offset);
+ if (size > 0) {
+ nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
- if (unlikely(size <= 0))
- return size ? size : -EFAULT;
+ } else
+ nr_pages = 0;
+
+ if (unlikely(size <= 0)) {
+ ret = size ? size : -EFAULT;
+ goto out;
+ }
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
- int ret;
len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
ret = bio_iov_add_zone_append_page(bio, page, len,
offset);
- else
- ret = bio_iov_add_page(bio, page, len, offset);
+ if (ret)
+ break;
+ } else
+ bio_iov_add_page(bio, page, len, offset);
- if (ret) {
- bio_put_pages(pages + i, left, offset);
- return ret;
- }
offset = 0;
}
- iov_iter_advance(iter, size);
- return 0;
+ iov_iter_advance(iter, size - left);
+out:
+ while (i < nr_pages)
+ put_page(pages[i++]);
+
+ return ret;
}
/**
diff --git a/block/blk-core.c b/block/blk-core.c
index 3d286a256d3d..a0d1104c5590 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
- int ret;
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
GFP_KERNEL | __GFP_ZERO, node_id);
@@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
if (q->id < 0)
goto fail_srcu;
- ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
- if (ret)
- goto fail_id;
-
q->stats = blk_alloc_queue_stats();
if (!q->stats)
- goto fail_split;
+ goto fail_id;
q->node = node_id;
@@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
fail_stats:
blk_free_queue_stats(q->stats);
-fail_split:
- bioset_exit(&q->bio_split);
fail_id:
ida_free(&blk_queue_ida, q->id);
fail_srcu:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c8a699754c9..ff04e9290715 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -82,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
bio_get_first_bvec(next, &nb);
if (biovec_phys_mergeable(q, &pb, &nb))
return false;
- return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+ return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@@ -95,23 +95,30 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
return bio_will_gap(req->q, NULL, bio, req->bio);
}
-static struct bio *blk_bio_discard_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *nsegs)
+/*
+ * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
+ * is defined as 'unsigned int', meantime it has to be aligned to with the
+ * logical block size, which is the minimum accepted unit by hardware.
+ */
+static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+{
+ return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
+}
+
+static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
- int alignment;
sector_t tmp;
unsigned split_sectors;
*nsegs = 1;
/* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
+ granularity = max(lim->discard_granularity >> 9, 1U);
- max_discard_sectors = min(q->limits.max_discard_sectors,
- bio_allowed_max_sectors(q));
+ max_discard_sectors =
+ min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
@@ -128,9 +135,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
* If the next starting sector would be misaligned, stop the discard at
* the previous aligned sector.
*/
- alignment = (q->limits.discard_alignment >> 9) % granularity;
-
- tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+ tmp = bio->bi_iter.bi_sector + split_sectors -
+ ((lim->discard_alignment >> 9) % granularity);
tmp = sector_div(tmp, granularity);
if (split_sectors > tmp)
@@ -139,18 +145,15 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
- struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+ struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
-
- if (!q->limits.max_write_zeroes_sectors)
+ if (!lim->max_write_zeroes_sectors)
return NULL;
-
- if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+ if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
return NULL;
-
- return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+ return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
/*
@@ -161,17 +164,17 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
-static inline unsigned get_max_io_size(struct request_queue *q,
- struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+ struct queue_limits *lim)
{
- unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
- unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
- unsigned max_sectors = queue_max_sectors(q), start, end;
+ unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
+ unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
+ unsigned max_sectors = lim->max_sectors, start, end;
- if (q->limits.chunk_sectors) {
+ if (lim->chunk_sectors) {
max_sectors = min(max_sectors,
blk_chunk_sectors_left(bio->bi_iter.bi_sector,
- q->limits.chunk_sectors));
+ lim->chunk_sectors));
}
start = bio->bi_iter.bi_sector & (pbs - 1);
@@ -181,11 +184,10 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return max_sectors & ~(lbs - 1);
}
-static inline unsigned get_max_segment_size(const struct request_queue *q,
- struct page *start_page,
- unsigned long offset)
+static inline unsigned get_max_segment_size(struct queue_limits *lim,
+ struct page *start_page, unsigned long offset)
{
- unsigned long mask = queue_segment_boundary(q);
+ unsigned long mask = lim->seg_boundary_mask;
offset = mask & (page_to_phys(start_page) + offset);
@@ -194,12 +196,12 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* on 32bit arch, use queue's max segment size when that happens.
*/
return min_not_zero(mask - offset + 1,
- (unsigned long)queue_max_segment_size(q));
+ (unsigned long)lim->max_segment_size);
}
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
- * @q: [in] request queue associated with the bio associated with @bv
+ * @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
@@ -217,10 +219,9 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
-static bool bvec_split_segs(const struct request_queue *q,
- const struct bio_vec *bv, unsigned *nsegs,
- unsigned *bytes, unsigned max_segs,
- unsigned max_bytes)
+static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
+ unsigned *nsegs, unsigned *bytes, unsigned max_segs,
+ unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@@ -228,7 +229,7 @@ static bool bvec_split_segs(const struct request_queue *q,
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
- seg_size = get_max_segment_size(q, bv->bv_page,
+ seg_size = get_max_segment_size(lim, bv->bv_page,
bv->bv_offset + total_len);
seg_size = min(seg_size, len);
@@ -236,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q,
total_len += seg_size;
len -= seg_size;
- if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
break;
}
@@ -247,16 +248,17 @@ static bool bvec_split_segs(const struct request_queue *q,
}
/**
- * blk_bio_segment_split - split a bio in two bios
- * @q: [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
* @bio: [in] bio to be split
- * @bs: [in] bio set to allocate the clone from
+ * @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs: [in] bio set to allocate the clone from
+ * @max_bytes: [in] maximum number of bytes per bio
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most @max_bytes worth of data
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -265,33 +267,30 @@ static bool bvec_split_segs(const struct request_queue *q,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *blk_bio_segment_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+ unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, bytes = 0;
- const unsigned max_bytes = get_max_io_size(q, bio) << 9;
- const unsigned max_segs = queue_max_segments(q);
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
goto split;
- if (nsegs < max_segs &&
+ if (nsegs < lim->max_segments &&
bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
bytes += bv.bv_len;
- } else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
- max_bytes)) {
- goto split;
+ } else {
+ if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
+ lim->max_segments, max_bytes))
+ goto split;
}
bvprv = bv;
@@ -308,7 +307,7 @@ split:
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
*/
- bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
+ bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@@ -320,34 +319,35 @@ split:
}
/**
- * __blk_queue_split - split a bio and submit the second half
- * @q: [in] request_queue new bio is being queued at
- * @bio: [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ * @lim: queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
+ *
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
*
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
*/
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs)
{
- struct bio *split = NULL;
+ struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+ struct bio *split;
- switch (bio_op(*bio)) {
+ switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_discard(bio, lim, nr_segs, bs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
- nr_segs);
+ split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
default:
- split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_rw(bio, lim, nr_segs, bs,
+ get_max_io_size(bio, lim) << SECTOR_SHIFT);
break;
}
@@ -356,32 +356,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
split->bi_opf |= REQ_NOMERGE;
blkcg_bio_issue_init(split);
- bio_chain(split, *bio);
- trace_block_split(split, (*bio)->bi_iter.bi_sector);
- submit_bio_noacct(*bio);
- *bio = split;
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ submit_bio_noacct(bio);
+ return split;
}
+ return bio;
}
/**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ *
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it. @bio is shortened to the remainder and re-submitted.
*
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
*/
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
{
- struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+ struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- if (blk_may_split(q, *bio))
- __blk_queue_split(q, bio, &nr_segs);
+ if (bio_may_exceed_limits(bio, lim))
+ return __bio_split_to_limits(bio, lim, &nr_segs);
+ return bio;
}
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
@@ -411,7 +414,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
}
rq_for_each_bvec(bv, rq, iter)
- bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
+ bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@@ -442,8 +445,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
while (nbytes > 0) {
unsigned offset = bvec->bv_offset + total;
- unsigned len = min(get_max_segment_size(q, bvec->bv_page,
- offset), nbytes);
+ unsigned len = min(get_max_segment_size(&q->limits,
+ bvec->bv_page, offset), nbytes);
struct page *page = bvec->bv_page;
/*
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70177ee74295..5ee62b95f3e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2815,9 +2815,9 @@ void blk_mq_submit_bio(struct bio *bio)
unsigned int nr_segs = 1;
blk_status_t ret;
- blk_queue_bounce(q, &bio);
- if (blk_may_split(q, bio))
- __blk_queue_split(q, &bio, &nr_segs);
+ bio = blk_queue_bounce(bio, q);
+ if (bio_may_exceed_limits(bio, &q->limits))
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio_integrity_prep(bio))
return;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c0303026752d..e1f009aba6fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
if (queue_is_mq(q))
blk_mq_release(q);
- bioset_exit(&q->bio_split);
-
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
diff --git a/block/blk.h b/block/blk.h
index 1d83b1d41cd1..d7142c4d2fef 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -97,23 +97,23 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
+static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- return (offset & queue_virt_boundary(q)) ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return (offset & lim->virt_boundary_mask) ||
+ ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
-static inline bool bvec_gap_to_prev(struct request_queue *q,
+static inline bool bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- if (!queue_virt_boundary(q))
+ if (!lim->virt_boundary_mask)
return false;
- return __bvec_gap_to_prev(q, bprv, offset);
+ return __bvec_gap_to_prev(lim, bprv, offset);
}
static inline bool rq_mergeable(struct request *rq)
@@ -189,7 +189,8 @@ static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(req->bio);
struct bio_integrity_payload *bip_next = bio_integrity(next);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@@ -199,7 +200,8 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@@ -288,7 +290,8 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio,
+ struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@@ -307,12 +310,12 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
- return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+ return lim->chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -345,16 +348,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
}
/*
- * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
- * is defined as 'unsigned int', meantime it has to aligned to with logical
- * block size which is the minimum accepted unit by hardware.
- */
-static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
-{
- return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
-}
-
-/*
* Internal io_context interface
*/
struct io_cq *ioc_find_get_icq(struct request_queue *q);
@@ -378,7 +371,7 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
+struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
{
@@ -387,10 +380,12 @@ static inline bool blk_queue_may_bounce(struct request_queue *q)
max_low_pfn >= max_pfn;
}
-static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+static inline struct bio *blk_queue_bounce(struct bio *bio,
+ struct request_queue *q)
{
- if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
- __blk_queue_bounce(q, bio);
+ if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
+ return __blk_queue_bounce(bio, q);
+ return bio;
}
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
diff --git a/block/bounce.c b/block/bounce.c
index c8f487af7be3..7cfcb242f9a1 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -199,24 +199,24 @@ err_put:
return NULL;
}
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
{
struct bio *bio;
- int rw = bio_data_dir(*bio_orig);
+ int rw = bio_data_dir(bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
unsigned i = 0, bytes = 0;
bool bounce = false;
int sectors;
- bio_for_each_segment(from, *bio_orig, iter) {
+ bio_for_each_segment(from, bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
- return;
+ return bio_orig;
/*
* Individual bvecs might not be logical block aligned. Round down
@@ -225,13 +225,13 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
*/
sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
SECTOR_SHIFT;
- if (sectors < bio_sectors(*bio_orig)) {
- bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, *bio_orig);
- submit_bio_noacct(*bio_orig);
- *bio_orig = bio;
+ if (sectors < bio_sectors(bio_orig)) {
+ bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
+ bio_chain(bio, bio_orig);
+ submit_bio_noacct(bio_orig);
+ bio_orig = bio;
}
- bio = bounce_clone_bio(*bio_orig);
+ bio = bounce_clone_bio(bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
@@ -254,7 +254,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
to->bv_page = bounce_page;
}
- trace_block_bio_bounce(*bio_orig);
+ trace_block_bio_bounce(bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
@@ -263,6 +263,6 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
else
bio->bi_end_io = bounce_end_io_write;
- bio->bi_private = *bio_orig;
- *bio_orig = bio;
+ bio->bi_private = bio_orig;
+ return bio;
}
diff --git a/block/genhd.c b/block/genhd.c
index e1d5b10ac193..b901fea1d55a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
blk_mq_exit_queue(disk->queue);
blkcg_exit_queue(disk->queue);
+ bioset_exit(&disk->bio_split);
disk_release_events(disk);
kfree(disk->random);
@@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk)
goto out_put_queue;
+ if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+ goto out_free_disk;
+
disk->bdi = bdi_alloc(node_id);
if (!disk->bdi)
- goto out_free_disk;
+ goto out_free_bioset;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
@@ -1382,6 +1386,8 @@ out_destroy_part_tbl:
iput(disk->part0->bd_inode);
out_free_bdi:
bdi_put(disk->bdi);
+out_free_bioset:
+ bioset_exit(&disk->bio_split);
out_free_disk:
kfree(disk);
out_put_queue:
diff --git a/crypto/kpp.c b/crypto/kpp.c
index 7aa6ba4b60a4..678e871ce418 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn,
}
EXPORT_SYMBOL_GPL(crypto_grab_kpp);
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
+{
+ return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_kpp);
+
static void kpp_prepare_alg(struct kpp_alg *alg)
{
struct crypto_alg *base = &alg->base;
diff --git a/crypto/shash.c b/crypto/shash.c
index 0a0a50cb694f..4c88e63b3350 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
+{
+ return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_shash);
+
static int shash_prepare_alg(struct shash_alg *alg)
{
struct crypto_alg *base = &alg->base;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e19fcab016ba..db1b4b202646 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -248,15 +248,6 @@ config BLK_DEV_NBD
If unsure, say N.
-config BLK_DEV_SX8
- tristate "Promise SATA SX8 support"
- depends on PCI
- help
- Saying Y or M here will enable support for the
- Promise SATA SX8 controllers.
-
- Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
-
config BLK_DEV_RAM
tristate "RAM block device support"
help
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index be631352567e..101612cba303 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -26,8 +26,6 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
-obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
-
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 603f6828dd79..7d9db33363de 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -974,25 +974,58 @@ static void drbd_bm_endio(struct bio *bio)
}
}
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + bdev->md.al_offset -1;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
+ }
+}
+
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
struct drbd_device *device = ctx->device;
enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
- struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
- GFP_NOIO, &drbd_md_io_bio_set);
struct drbd_bitmap *b = device->bitmap;
+ struct bio *bio;
struct page *page;
+ sector_t last_bm_sect;
+ sector_t first_bm_sect;
+ sector_t on_disk_sector;
unsigned int len;
- sector_t on_disk_sector =
- device->ldev->md.md_offset + device->ldev->md.bm_offset;
- on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+ first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
+ on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
/* this might happen with very small
* flexible external meta data device,
* or with PAGE_SIZE > 4k */
- len = min_t(unsigned int, PAGE_SIZE,
- (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+ last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
+ if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
+ sector_t len_sect = last_bm_sect - on_disk_sector + 1;
+ if (len_sect < PAGE_SIZE/SECTOR_SIZE)
+ len = (unsigned int)len_sect*SECTOR_SIZE;
+ else
+ len = PAGE_SIZE;
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state)) {
+ drbd_err(device, "Invalid offset during on-disk bitmap access: "
+ "page idx %u, sector %llu\n", page_nr, on_disk_sector);
+ }
+ ctx->error = -EIO;
+ bm_set_page_io_err(b->bm_pages[page_nr]);
+ if (atomic_dec_and_test(&ctx->in_flight)) {
+ ctx->done = 1;
+ wake_up(&device->misc_wait);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+ }
+ return;
+ }
/* serialize IO on this page */
bm_page_lock_io(device, page_nr);
@@ -1007,6 +1040,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
+ bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
+ &drbd_md_io_bio_set);
bio->bi_iter.bi_sector = on_disk_sector;
/* bio_add_page of a single page to an empty bio will always succeed,
* according to api. Do we want to assert that? */
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 6d8dd14458c6..8f7f144e54f3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
{
struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
/*
* what we "blindly" assume:
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f5d098a148cb..2a709daefbc4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -11,6 +11,8 @@
* (part of code stolen from loop.c)
*/
+#define pr_fmt(fmt) "nbd: " fmt
+
#include <linux/major.h>
#include <linux/blkdev.h>
@@ -1950,7 +1952,7 @@ again:
test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- pr_err("nbd: device at index %d is going down\n",
+ pr_err("device at index %d is going down\n",
index);
return -EINVAL;
}
@@ -1961,7 +1963,7 @@ again:
if (!nbd) {
nbd = nbd_dev_add(index, 2);
if (IS_ERR(nbd)) {
- pr_err("nbd: failed to add new device\n");
+ pr_err("failed to add new device\n");
return PTR_ERR(nbd);
}
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 8b224ede2e33..c451c477978f 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -201,6 +201,22 @@ static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+static bool g_memory_backed;
+module_param_named(memory_backed, g_memory_backed, bool, 0444);
+MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
+
+static bool g_discard;
+module_param_named(discard, g_discard, bool, 0444);
+MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
+
+static unsigned long g_cache_size;
+module_param_named(cache_size, g_cache_size, ulong, 0444);
+MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+
+static unsigned int g_mbps;
+module_param_named(mbps, g_mbps, uint, 0444);
+MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
+
static bool g_zoned;
module_param_named(zoned, g_zoned, bool, S_IRUGO);
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
@@ -409,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
+NULLB_DEVICE_ATTR(no_sched, bool, NULL);
+NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -532,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
&nullb_device_attr_virt_boundary,
+ &nullb_device_attr_no_sched,
+ &nullb_device_attr_shared_tag_bitmap,
NULL,
};
@@ -588,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE,
- "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n");
+ "badblocks,blocking,blocksize,cache_size,"
+ "completion_nsec,discard,home_node,hw_queue_depth,"
+ "irqmode,max_sectors,mbps,memory_backed,no_sched,"
+ "poll_queues,power,queue_mode,shared_tag_bitmap,size,"
+ "submit_queues,use_per_node_hctx,virt_boundary,zoned,"
+ "zone_capacity,zone_max_active,zone_max_open,"
+ "zone_nr_conv,zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -650,6 +676,10 @@ static struct nullb_device *null_alloc_dev(void)
dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth;
dev->blocking = g_blocking;
+ dev->memory_backed = g_memory_backed;
+ dev->discard = g_discard;
+ dev->cache_size = g_cache_size;
+ dev->mbps = g_mbps;
dev->use_per_node_hctx = g_use_per_node_hctx;
dev->zoned = g_zoned;
dev->zone_size = g_zone_size;
@@ -658,6 +688,8 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_max_open = g_zone_max_open;
dev->zone_max_active = g_zone_max_active;
dev->virt_boundary = g_virt_boundary;
+ dev->no_sched = g_no_sched;
+ dev->shared_tag_bitmap = g_shared_tag_bitmap;
return dev;
}
@@ -1655,7 +1687,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
static void cleanup_queue(struct nullb_queue *nq)
{
- kfree(nq->tag_map);
+ bitmap_free(nq->tag_map);
kfree(nq->cmds);
}
@@ -1782,14 +1814,13 @@ static const struct block_device_operations null_rq_ops = {
static int setup_commands(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
- int i, tag_size;
+ int i;
nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
if (!nq->cmds)
return -ENOMEM;
- tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
- nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
+ nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
if (!nq->tag_map) {
kfree(nq->cmds);
return -ENOMEM;
@@ -1866,31 +1897,48 @@ static int null_gendisk_register(struct nullb *nullb)
static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
+ unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
+ int hw_queues, numa_node;
+ unsigned int queue_depth;
int poll_queues;
+ if (nullb) {
+ hw_queues = nullb->dev->submit_queues;
+ poll_queues = nullb->dev->poll_queues;
+ queue_depth = nullb->dev->hw_queue_depth;
+ numa_node = nullb->dev->home_node;
+ if (nullb->dev->no_sched)
+ flags |= BLK_MQ_F_NO_SCHED;
+ if (nullb->dev->shared_tag_bitmap)
+ flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+ if (nullb->dev->blocking)
+ flags |= BLK_MQ_F_BLOCKING;
+ } else {
+ hw_queues = g_submit_queues;
+ poll_queues = g_poll_queues;
+ queue_depth = g_hw_queue_depth;
+ numa_node = g_home_node;
+ if (g_no_sched)
+ flags |= BLK_MQ_F_NO_SCHED;
+ if (g_shared_tag_bitmap)
+ flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+ if (g_blocking)
+ flags |= BLK_MQ_F_BLOCKING;
+ }
+
set->ops = &null_mq_ops;
- set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
- g_submit_queues;
- poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
- if (poll_queues)
- set->nr_hw_queues += poll_queues;
- set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
- g_hw_queue_depth;
- set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
set->cmd_size = sizeof(struct nullb_cmd);
- set->flags = BLK_MQ_F_SHOULD_MERGE;
- if (g_no_sched)
- set->flags |= BLK_MQ_F_NO_SCHED;
- if (g_shared_tag_bitmap)
- set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+ set->flags = flags;
set->driver_data = nullb;
- if (poll_queues)
+ set->nr_hw_queues = hw_queues;
+ set->queue_depth = queue_depth;
+ set->numa_node = numa_node;
+ if (poll_queues) {
+ set->nr_hw_queues += poll_queues;
set->nr_maps = 3;
- else
+ } else {
set->nr_maps = 1;
-
- if ((nullb && nullb->dev->blocking) || g_blocking)
- set->flags |= BLK_MQ_F_BLOCKING;
+ }
return blk_mq_alloc_tag_set(set);
}
@@ -2042,8 +2090,13 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
mutex_lock(&lock);
- nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
- dev->index = nullb->index;
+ rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+ if (rv < 0) {
+ mutex_unlock(&lock);
+ goto out_cleanup_zone;
+ }
+ nullb->index = rv;
+ dev->index = rv;
mutex_unlock(&lock);
blk_queue_logical_block_size(nullb->q, dev->blocksize);
@@ -2069,7 +2122,7 @@ static int null_add_dev(struct nullb_device *dev)
rv = null_gendisk_register(nullb);
if (rv)
- goto out_cleanup_zone;
+ goto out_ida_free;
mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
@@ -2078,6 +2131,9 @@ static int null_add_dev(struct nullb_device *dev)
pr_info("disk %s created\n", nullb->disk_name);
return 0;
+
+out_ida_free:
+ ida_free(&nullb_indexes, nullb->index);
out_cleanup_zone:
null_free_zoned_dev(dev);
out_cleanup_disk:
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 6fbf0a1b2622..94ff68052b1e 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -113,6 +113,8 @@ struct nullb_device {
bool discard; /* if support discard */
bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */
+ bool no_sched; /* no IO scheduler for the device */
+ bool shared_tag_bitmap; /* use hostwide shared tags */
};
struct nullb {
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 01a15dbd9cde..4cea3b08087e 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
struct bio *split;
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
(unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index d1e0fefec90b..e1d080f680ed 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
dev_dbg(&dev->core, "%s\n", __func__);
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
spin_lock_irq(&priv->lock);
busy = !bio_list_empty(&priv->list);
diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c
index 2be5d87a3ca6..e7c7d9a68168 100644
--- a/drivers/block/rnbd/rnbd-clt-sysfs.c
+++ b/drivers/block/rnbd/rnbd-clt-sysfs.c
@@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
if (ret)
return ret;
- ret = rnbd_clt_resize_disk(dev, (size_t)sectors);
+ ret = rnbd_clt_resize_disk(dev, sectors);
if (ret)
return ret;
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index b8d9e2824d9c..04da33a22ef4 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -68,39 +68,18 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
return refcount_inc_not_zero(&dev->refcount);
}
-static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
- const struct rnbd_msg_open_rsp *rsp)
+static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
+ sector_t new_nsectors)
{
- struct rnbd_clt_session *sess = dev->sess;
-
- if (!rsp->logical_block_size)
- return -EINVAL;
-
- dev->device_id = le32_to_cpu(rsp->device_id);
- dev->nsectors = le64_to_cpu(rsp->nsectors);
- dev->logical_block_size = le16_to_cpu(rsp->logical_block_size);
- dev->physical_block_size = le16_to_cpu(rsp->physical_block_size);
- dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors);
- dev->discard_granularity = le32_to_cpu(rsp->discard_granularity);
- dev->discard_alignment = le32_to_cpu(rsp->discard_alignment);
- dev->secure_discard = le16_to_cpu(rsp->secure_discard);
- dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK);
- dev->fua = !!(rsp->cache_policy & RNBD_FUA);
-
- dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
- dev->max_segments = sess->max_segments;
-
- return 0;
-}
+ if (get_capacity(dev->gd) == new_nsectors)
+ return;
-static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
- size_t new_nsectors)
-{
- rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
- dev->nsectors, new_nsectors);
- dev->nsectors = new_nsectors;
- set_capacity_and_notify(dev->gd, dev->nsectors);
- return 0;
+ /*
+ * If the size changed, we need to revalidate it
+ */
+ rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
+ get_capacity(dev->gd), new_nsectors);
+ set_capacity_and_notify(dev->gd, new_nsectors);
}
static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
@@ -119,19 +98,16 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
u64 nsectors = le64_to_cpu(rsp->nsectors);
- /*
- * If the device was remapped and the size changed in the
- * meantime we need to revalidate it
- */
- if (dev->nsectors != nsectors)
- rnbd_clt_change_capacity(dev, nsectors);
+ rnbd_clt_change_capacity(dev, nsectors);
gd_kobj = &disk_to_dev(dev->gd)->kobj;
kobject_uevent(gd_kobj, KOBJ_ONLINE);
rnbd_clt_info(dev, "Device online, device remapped successfully\n");
}
- err = rnbd_clt_set_dev_attr(dev, rsp);
- if (err)
+ if (!rsp->logical_block_size) {
+ err = -EINVAL;
goto out;
+ }
+ dev->device_id = le32_to_cpu(rsp->device_id);
dev->dev_state = DEV_STATE_MAPPED;
out:
@@ -140,7 +116,7 @@ out:
return err;
}
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
{
int ret = 0;
@@ -150,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
ret = -ENOENT;
goto out;
}
- ret = rnbd_clt_change_capacity(dev, newsize);
+ rnbd_clt_change_capacity(dev, newsize);
out:
mutex_unlock(&dev->lock);
@@ -507,6 +483,11 @@ static void msg_open_conf(struct work_struct *work)
struct rnbd_msg_open_rsp *rsp = iu->buf;
struct rnbd_clt_dev *dev = iu->dev;
int errno = iu->errno;
+ bool from_map = false;
+
+ /* INIT state is only triggered from rnbd_clt_map_device */
+ if (dev->dev_state == DEV_STATE_INIT)
+ from_map = true;
if (errno) {
rnbd_clt_err(dev,
@@ -523,7 +504,9 @@ static void msg_open_conf(struct work_struct *work)
send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
}
}
- kfree(rsp);
+ /* We free rsp in rnbd_clt_map_device for map scenario */
+ if (!from_map)
+ kfree(rsp);
wake_up_iu_comp(iu, errno);
rnbd_put_iu(dev->sess, iu);
rnbd_clt_put_dev(dev);
@@ -942,7 +925,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
{
struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
- if (dev->read_only && (mode & FMODE_WRITE))
+ if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
return -EPERM;
if (dev->dev_state == DEV_STATE_UNMAPPED ||
@@ -963,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device,
struct hd_geometry *geo)
{
u64 size;
- struct rnbd_clt_dev *dev;
+ struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+ struct queue_limits *limit = &dev->queue->limits;
- dev = block_device->bd_disk->private_data;
- size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
+ size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
geo->cylinders = size >> 6; /* size/64 */
geo->heads = 4;
geo->sectors = 16;
@@ -1350,11 +1333,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
}
}
-static void setup_request_queue(struct rnbd_clt_dev *dev)
+static void setup_request_queue(struct rnbd_clt_dev *dev,
+ struct rnbd_msg_open_rsp *rsp)
{
- blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
- blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
- blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
+ blk_queue_logical_block_size(dev->queue,
+ le16_to_cpu(rsp->logical_block_size));
+ blk_queue_physical_block_size(dev->queue,
+ le16_to_cpu(rsp->physical_block_size));
+ blk_queue_max_hw_sectors(dev->queue,
+ dev->sess->max_io_size / SECTOR_SIZE);
/*
* we don't support discards to "discontiguous" segments
@@ -1362,21 +1349,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
*/
blk_queue_max_discard_segments(dev->queue, 1);
- blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
- dev->queue->limits.discard_granularity = dev->discard_granularity;
- dev->queue->limits.discard_alignment = dev->discard_alignment;
- if (dev->secure_discard)
+ blk_queue_max_discard_sectors(dev->queue,
+ le32_to_cpu(rsp->max_discard_sectors));
+ dev->queue->limits.discard_granularity =
+ le32_to_cpu(rsp->discard_granularity);
+ dev->queue->limits.discard_alignment =
+ le32_to_cpu(rsp->discard_alignment);
+ if (le16_to_cpu(rsp->secure_discard))
blk_queue_max_secure_erase_sectors(dev->queue,
- dev->max_discard_sectors);
+ le32_to_cpu(rsp->max_discard_sectors));
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
- blk_queue_max_segments(dev->queue, dev->max_segments);
+ blk_queue_max_segments(dev->queue, dev->sess->max_segments);
blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
- blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
+ blk_queue_write_cache(dev->queue,
+ !!(rsp->cache_policy & RNBD_WRITEBACK),
+ !!(rsp->cache_policy & RNBD_FUA));
}
-static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
+ struct rnbd_msg_open_rsp *rsp, int idx)
{
int err;
@@ -1388,19 +1381,15 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
idx);
- pr_debug("disk_name=%s, capacity=%zu\n",
+ pr_debug("disk_name=%s, capacity=%llu\n",
dev->gd->disk_name,
- dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
- );
+ le64_to_cpu(rsp->nsectors) *
+ (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
- set_capacity(dev->gd, dev->nsectors);
+ set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
- if (dev->access_mode == RNBD_ACCESS_RO) {
- dev->read_only = true;
+ if (dev->access_mode == RNBD_ACCESS_RO)
set_disk_ro(dev->gd, true);
- } else {
- dev->read_only = false;
- }
/*
* Network device does not need rotational
@@ -1413,11 +1402,13 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
return err;
}
-static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
+static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
+ struct rnbd_msg_open_rsp *rsp)
{
int idx = dev->clt_device_id;
- dev->size = dev->nsectors * dev->logical_block_size;
+ dev->size = le64_to_cpu(rsp->nsectors) *
+ le16_to_cpu(rsp->logical_block_size);
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
if (IS_ERR(dev->gd))
@@ -1425,8 +1416,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
dev->queue = dev->gd->queue;
rnbd_init_mq_hw_queues(dev);
- setup_request_queue(dev);
- return rnbd_clt_setup_gen_disk(dev, idx);
+ setup_request_queue(dev, rsp);
+ return rnbd_clt_setup_gen_disk(dev, rsp, idx);
}
static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
@@ -1562,7 +1553,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
{
struct rnbd_clt_session *sess;
struct rnbd_clt_dev *dev;
- int ret;
+ int ret, errno;
+ struct rnbd_msg_open_rsp *rsp;
+ struct rnbd_msg_open msg;
+ struct rnbd_iu *iu;
+ struct kvec vec = {
+ .iov_base = &msg,
+ .iov_len = sizeof(msg)
+ };
if (exists_devpath(pathname, sessname))
return ERR_PTR(-EEXIST);
@@ -1582,17 +1580,47 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
ret = -EEXIST;
goto put_dev;
}
- ret = send_msg_open(dev, RTRS_PERMIT_WAIT);
+
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ if (!rsp) {
+ ret = -ENOMEM;
+ goto del_dev;
+ }
+
+ iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+ if (!iu) {
+ ret = -ENOMEM;
+ kfree(rsp);
+ goto del_dev;
+ }
+ iu->buf = rsp;
+ iu->dev = dev;
+ sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+ msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN);
+ msg.access_mode = dev->access_mode;
+ strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+ WARN_ON(!rnbd_clt_get_dev(dev));
+ ret = send_usr_msg(sess->rtrs, READ, iu,
+ &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+ msg_open_conf, &errno, RTRS_PERMIT_WAIT);
+ if (ret) {
+ rnbd_clt_put_dev(dev);
+ rnbd_put_iu(sess, iu);
+ } else {
+ ret = errno;
+ }
if (ret) {
rnbd_clt_err(dev,
"map_device: failed, can't open remote device, err: %d\n",
ret);
- goto del_dev;
+ goto put_iu;
}
mutex_lock(&dev->lock);
pr_debug("Opened remote device: session=%s, path='%s'\n",
sess->sessname, pathname);
- ret = rnbd_client_setup_device(dev);
+ ret = rnbd_client_setup_device(dev, rsp);
if (ret) {
rnbd_clt_err(dev,
"map_device: Failed to configure device, err: %d\n",
@@ -1602,21 +1630,30 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
}
rnbd_clt_info(dev,
- "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
- dev->gd->disk_name, dev->nsectors,
- dev->logical_block_size, dev->physical_block_size,
- dev->max_discard_sectors,
- dev->discard_granularity, dev->discard_alignment,
- dev->secure_discard, dev->max_segments,
- dev->max_hw_sectors, dev->wc, dev->fua);
+ "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
+ dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
+ le16_to_cpu(rsp->logical_block_size),
+ le16_to_cpu(rsp->physical_block_size),
+ le32_to_cpu(rsp->max_discard_sectors),
+ le32_to_cpu(rsp->discard_granularity),
+ le32_to_cpu(rsp->discard_alignment),
+ le16_to_cpu(rsp->secure_discard),
+ sess->max_segments, sess->max_io_size / SECTOR_SIZE,
+ !!(rsp->cache_policy & RNBD_WRITEBACK),
+ !!(rsp->cache_policy & RNBD_FUA));
mutex_unlock(&dev->lock);
+ kfree(rsp);
+ rnbd_put_iu(sess, iu);
rnbd_clt_put_sess(sess);
return dev;
send_close:
send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
+put_iu:
+ kfree(rsp);
+ rnbd_put_iu(sess, iu);
del_dev:
delete_dev(dev);
put_dev:
diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h
index 2e2e8c4a85c1..a48e040abe63 100644
--- a/drivers/block/rnbd/rnbd-clt.h
+++ b/drivers/block/rnbd/rnbd-clt.h
@@ -106,6 +106,7 @@ struct rnbd_queue {
};
struct rnbd_clt_dev {
+ struct kobject kobj;
struct rnbd_clt_session *sess;
struct request_queue *queue;
struct rnbd_queue *hw_queues;
@@ -114,27 +115,14 @@ struct rnbd_clt_dev {
u32 clt_device_id;
struct mutex lock;
enum rnbd_clt_dev_state dev_state;
+ refcount_t refcount;
char *pathname;
enum rnbd_access_mode access_mode;
u32 nr_poll_queues;
- bool read_only;
- bool wc;
- bool fua;
- u32 max_hw_sectors;
- u32 max_discard_sectors;
- u32 discard_granularity;
- u32 discard_alignment;
- u16 secure_discard;
- u16 physical_block_size;
- u16 logical_block_size;
- u16 max_segments;
- size_t nsectors;
u64 size; /* device size in bytes */
struct list_head list;
struct gendisk *gd;
- struct kobject kobj;
char *blk_symlink_name;
- refcount_t refcount;
struct work_struct unmap_on_rmmod_work;
};
@@ -150,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
const struct attribute *sysfs_self);
int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize);
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
/* rnbd-clt-sysfs.c */
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 0713014bf423..5e08da277ddf 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -224,7 +224,6 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
wait_for_completion(&dc); /* wait for inflights to drop to zero */
rnbd_dev_close(sess_dev->rnbd_dev);
- list_del(&sess_dev->sess_list);
mutex_lock(&sess_dev->dev->lock);
list_del(&sess_dev->dev_list);
if (sess_dev->open_flags & FMODE_WRITE)
@@ -239,14 +238,14 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
static void destroy_sess(struct rnbd_srv_session *srv_sess)
{
- struct rnbd_srv_sess_dev *sess_dev, *tmp;
+ struct rnbd_srv_sess_dev *sess_dev;
+ unsigned long index;
- if (list_empty(&srv_sess->sess_dev_list))
+ if (xa_empty(&srv_sess->index_idr))
goto out;
mutex_lock(&srv_sess->lock);
- list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
- sess_list)
+ xa_for_each(&srv_sess->index_idr, index, sess_dev)
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
mutex_unlock(&srv_sess->lock);
@@ -281,7 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
- INIT_LIST_HEAD(&srv_sess->sess_dev_list);
mutex_init(&srv_sess->lock);
mutex_lock(&sess_lock);
list_add(&srv_sess->list, &sess_list);
@@ -323,10 +321,11 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
{
struct rnbd_srv_session *sess = sess_dev->sess;
- sess_dev->keep_id = true;
/* It is already started to close by client's close message. */
if (!mutex_trylock(&sess->lock))
return;
+
+ sess_dev->keep_id = true;
/* first remove sysfs itself to avoid deadlock */
sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
@@ -666,11 +665,12 @@ static struct rnbd_srv_sess_dev *
find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
{
struct rnbd_srv_sess_dev *sess_dev;
+ unsigned long index;
- if (list_empty(&srv_sess->sess_dev_list))
+ if (xa_empty(&srv_sess->index_idr))
return NULL;
- list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
+ xa_for_each(&srv_sess->index_idr, index, sess_dev)
if (!strcmp(sess_dev->pathname, dev_name))
return sess_dev;
@@ -780,8 +780,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
mutex_unlock(&srv_dev->lock);
- list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
-
rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
kfree(full_path);
diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h
index 6926f9069dc4..081bceaf4ae9 100644
--- a/drivers/block/rnbd/rnbd-srv.h
+++ b/drivers/block/rnbd/rnbd-srv.h
@@ -25,8 +25,6 @@ struct rnbd_srv_session {
int queue_depth;
struct xarray index_idr;
- /* List of struct rnbd_srv_sess_dev */
- struct list_head sess_dev_list;
struct mutex lock;
u8 ver;
};
@@ -48,8 +46,6 @@ struct rnbd_srv_dev {
struct rnbd_srv_sess_dev {
/* Entry inside rnbd_srv_dev struct */
struct list_head dev_list;
- /* Entry inside rnbd_srv_session struct */
- struct list_head sess_list;
struct rnbd_dev *rnbd_dev;
struct rnbd_srv_session *sess;
struct rnbd_srv_dev *dev;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
deleted file mode 100644
index 0e1a484cab0b..000000000000
--- a/drivers/block/sx8.c
+++ /dev/null
@@ -1,1582 +0,0 @@
-/*
- * sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware
- *
- * Copyright 2004-2005 Red Hat, Inc.
- *
- * Author/maintainer: Jeff Garzik <jgarzik@pobox.com>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/blk-mq.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/workqueue.h>
-#include <linux/bitops.h>
-#include <linux/delay.h>
-#include <linux/ktime.h>
-#include <linux/hdreg.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#if 0
-#define CARM_DEBUG
-#define CARM_VERBOSE_DEBUG
-#else
-#undef CARM_DEBUG
-#undef CARM_VERBOSE_DEBUG
-#endif
-#undef CARM_NDEBUG
-
-#define DRV_NAME "sx8"
-#define DRV_VERSION "1.0"
-#define PFX DRV_NAME ": "
-
-MODULE_AUTHOR("Jeff Garzik");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Promise SATA SX8 block driver");
-MODULE_VERSION(DRV_VERSION);
-
-/*
- * SX8 hardware has a single message queue for all ATA ports.
- * When this driver was written, the hardware (firmware?) would
- * corrupt data eventually, if more than one request was outstanding.
- * As one can imagine, having 8 ports bottlenecking on a single
- * command hurts performance.
- *
- * Based on user reports, later versions of the hardware (firmware?)
- * seem to be able to survive with more than one command queued.
- *
- * Therefore, we default to the safe option -- 1 command -- but
- * allow the user to increase this.
- *
- * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ),
- * but problems seem to occur when you exceed ~30, even on newer hardware.
- */
-static int max_queue = 1;
-module_param(max_queue, int, 0444);
-MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)");
-
-
-#define NEXT_RESP(idx) ((idx + 1) % RMSG_Q_LEN)
-
-/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */
-#define TAG_ENCODE(tag) (((tag) << 16) | 0xf)
-#define TAG_DECODE(tag) (((tag) >> 16) & 0x1f)
-#define TAG_VALID(tag) ((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32))
-
-/* note: prints function name for you */
-#ifdef CARM_DEBUG
-#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#ifdef CARM_VERBOSE_DEBUG
-#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#else
-#define VPRINTK(fmt, args...)
-#endif /* CARM_VERBOSE_DEBUG */
-#else
-#define DPRINTK(fmt, args...)
-#define VPRINTK(fmt, args...)
-#endif /* CARM_DEBUG */
-
-#ifdef CARM_NDEBUG
-#define assert(expr)
-#else
-#define assert(expr) \
- if(unlikely(!(expr))) { \
- printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
- #expr, __FILE__, __func__, __LINE__); \
- }
-#endif
-
-/* defines only for the constants which don't work well as enums */
-struct carm_host;
-
-enum {
- /* adapter-wide limits */
- CARM_MAX_PORTS = 8,
- CARM_SHM_SIZE = (4096 << 7),
- CARM_MINORS_PER_MAJOR = 256 / CARM_MAX_PORTS,
- CARM_MAX_WAIT_Q = CARM_MAX_PORTS + 1,
-
- /* command message queue limits */
- CARM_MAX_REQ = 64, /* max command msgs per host */
- CARM_MSG_LOW_WATER = (CARM_MAX_REQ / 4), /* refill mark */
-
- /* S/G limits, host-wide and per-request */
- CARM_MAX_REQ_SG = 32, /* max s/g entries per request */
- CARM_MAX_HOST_SG = 600, /* max s/g entries per host */
- CARM_SG_LOW_WATER = (CARM_MAX_HOST_SG / 4), /* re-fill mark */
-
- /* hardware registers */
- CARM_IHQP = 0x1c,
- CARM_INT_STAT = 0x10, /* interrupt status */
- CARM_INT_MASK = 0x14, /* interrupt mask */
- CARM_HMUC = 0x18, /* host message unit control */
- RBUF_ADDR_LO = 0x20, /* response msg DMA buf low 32 bits */
- RBUF_ADDR_HI = 0x24, /* response msg DMA buf high 32 bits */
- RBUF_BYTE_SZ = 0x28,
- CARM_RESP_IDX = 0x2c,
- CARM_CMS0 = 0x30, /* command message size reg 0 */
- CARM_LMUC = 0x48,
- CARM_HMPHA = 0x6c,
- CARM_INITC = 0xb5,
-
- /* bits in CARM_INT_{STAT,MASK} */
- INT_RESERVED = 0xfffffff0,
- INT_WATCHDOG = (1 << 3), /* watchdog timer */
- INT_Q_OVERFLOW = (1 << 2), /* cmd msg q overflow */
- INT_Q_AVAILABLE = (1 << 1), /* cmd msg q has free space */
- INT_RESPONSE = (1 << 0), /* response msg available */
- INT_ACK_MASK = INT_WATCHDOG | INT_Q_OVERFLOW,
- INT_DEF_MASK = INT_RESERVED | INT_Q_OVERFLOW |
- INT_RESPONSE,
-
- /* command messages, and related register bits */
- CARM_HAVE_RESP = 0x01,
- CARM_MSG_READ = 1,
- CARM_MSG_WRITE = 2,
- CARM_MSG_VERIFY = 3,
- CARM_MSG_GET_CAPACITY = 4,
- CARM_MSG_FLUSH = 5,
- CARM_MSG_IOCTL = 6,
- CARM_MSG_ARRAY = 8,
- CARM_MSG_MISC = 9,
- CARM_CME = (1 << 2),
- CARM_RME = (1 << 1),
- CARM_WZBC = (1 << 0),
- CARM_RMI = (1 << 0),
- CARM_Q_FULL = (1 << 3),
- CARM_MSG_SIZE = 288,
- CARM_Q_LEN = 48,
-
- /* CARM_MSG_IOCTL messages */
- CARM_IOC_SCAN_CHAN = 5, /* scan channels for devices */
- CARM_IOC_GET_TCQ = 13, /* get tcq/ncq depth */
- CARM_IOC_SET_TCQ = 14, /* set tcq/ncq depth */
-
- IOC_SCAN_CHAN_NODEV = 0x1f,
- IOC_SCAN_CHAN_OFFSET = 0x40,
-
- /* CARM_MSG_ARRAY messages */
- CARM_ARRAY_INFO = 0,
-
- ARRAY_NO_EXIST = (1 << 31),
-
- /* response messages */
- RMSG_SZ = 8, /* sizeof(struct carm_response) */
- RMSG_Q_LEN = 48, /* resp. msg list length */
- RMSG_OK = 1, /* bit indicating msg was successful */
- /* length of entire resp. msg buffer */
- RBUF_LEN = RMSG_SZ * RMSG_Q_LEN,
-
- PDC_SHM_SIZE = (4096 << 7), /* length of entire h/w buffer */
-
- /* CARM_MSG_MISC messages */
- MISC_GET_FW_VER = 2,
- MISC_ALLOC_MEM = 3,
- MISC_SET_TIME = 5,
-
- /* MISC_GET_FW_VER feature bits */
- FW_VER_4PORT = (1 << 2), /* 1=4 ports, 0=8 ports */
- FW_VER_NON_RAID = (1 << 1), /* 1=non-RAID firmware, 0=RAID */
- FW_VER_ZCR = (1 << 0), /* zero channel RAID (whatever that is) */
-
- /* carm_host flags */
- FL_NON_RAID = FW_VER_NON_RAID,
- FL_4PORT = FW_VER_4PORT,
- FL_FW_VER_MASK = (FW_VER_NON_RAID | FW_VER_4PORT),
- FL_DYN_MAJOR = (1 << 17),
-};
-
-enum {
- CARM_SG_BOUNDARY = 0xffffUL, /* s/g segment boundary */
-};
-
-enum scatter_gather_types {
- SGT_32BIT = 0,
- SGT_64BIT = 1,
-};
-
-enum host_states {
- HST_INVALID, /* invalid state; never used */
- HST_ALLOC_BUF, /* setting up master SHM area */
- HST_ERROR, /* we never leave here */
- HST_PORT_SCAN, /* start dev scan */
- HST_DEV_SCAN_START, /* start per-device probe */
- HST_DEV_SCAN, /* continue per-device probe */
- HST_DEV_ACTIVATE, /* activate devices we found */
- HST_PROBE_FINISHED, /* probe is complete */
- HST_PROBE_START, /* initiate probe */
- HST_SYNC_TIME, /* tell firmware what time it is */
- HST_GET_FW_VER, /* get firmware version, adapter port cnt */
-};
-
-#ifdef CARM_DEBUG
-static const char *state_name[] = {
- "HST_INVALID",
- "HST_ALLOC_BUF",
- "HST_ERROR",
- "HST_PORT_SCAN",
- "HST_DEV_SCAN_START",
- "HST_DEV_SCAN",
- "HST_DEV_ACTIVATE",
- "HST_PROBE_FINISHED",
- "HST_PROBE_START",
- "HST_SYNC_TIME",
- "HST_GET_FW_VER",
-};
-#endif
-
-struct carm_port {
- unsigned int port_no;
- struct gendisk *disk;
- struct carm_host *host;
-
- /* attached device characteristics */
- u64 capacity;
- char name[41];
- u16 dev_geom_head;
- u16 dev_geom_sect;
- u16 dev_geom_cyl;
-};
-
-struct carm_request {
- int n_elem;
- unsigned int msg_type;
- unsigned int msg_subtype;
- unsigned int msg_bucket;
- struct scatterlist sg[CARM_MAX_REQ_SG];
-};
-
-struct carm_host {
- unsigned long flags;
- void __iomem *mmio;
- void *shm;
- dma_addr_t shm_dma;
-
- int major;
- int id;
- char name[32];
-
- spinlock_t lock;
- struct pci_dev *pdev;
- unsigned int state;
- u32 fw_ver;
-
- struct blk_mq_tag_set tag_set;
- struct request_queue *oob_q;
- unsigned int n_oob;
-
- unsigned int hw_sg_used;
-
- unsigned int resp_idx;
-
- unsigned int wait_q_prod;
- unsigned int wait_q_cons;
- struct request_queue *wait_q[CARM_MAX_WAIT_Q];
-
- void *msg_base;
- dma_addr_t msg_dma;
-
- int cur_scan_dev;
- unsigned long dev_active;
- unsigned long dev_present;
- struct carm_port port[CARM_MAX_PORTS];
-
- struct work_struct fsm_task;
-
- int probe_err;
- struct completion probe_comp;
-};
-
-struct carm_response {
- __le32 ret_handle;
- __le32 status;
-} __attribute__((packed));
-
-struct carm_msg_sg {
- __le32 start;
- __le32 len;
-} __attribute__((packed));
-
-struct carm_msg_rw {
- u8 type;
- u8 id;
- u8 sg_count;
- u8 sg_type;
- __le32 handle;
- __le32 lba;
- __le16 lba_count;
- __le16 lba_high;
- struct carm_msg_sg sg[32];
-} __attribute__((packed));
-
-struct carm_msg_allocbuf {
- u8 type;
- u8 subtype;
- u8 n_sg;
- u8 sg_type;
- __le32 handle;
- __le32 addr;
- __le32 len;
- __le32 evt_pool;
- __le32 n_evt;
- __le32 rbuf_pool;
- __le32 n_rbuf;
- __le32 msg_pool;
- __le32 n_msg;
- struct carm_msg_sg sg[8];
-} __attribute__((packed));
-
-struct carm_msg_ioctl {
- u8 type;
- u8 subtype;
- u8 array_id;
- u8 reserved1;
- __le32 handle;
- __le32 data_addr;
- u32 reserved2;
-} __attribute__((packed));
-
-struct carm_msg_sync_time {
- u8 type;
- u8 subtype;
- u16 reserved1;
- __le32 handle;
- u32 reserved2;
- __le32 timestamp;
-} __attribute__((packed));
-
-struct carm_msg_get_fw_ver {
- u8 type;
- u8 subtype;
- u16 reserved1;
- __le32 handle;
- __le32 data_addr;
- u32 reserved2;
-} __attribute__((packed));
-
-struct carm_fw_ver {
- __le32 version;
- u8 features;
- u8 reserved1;
- u16 reserved2;
-} __attribute__((packed));
-
-struct carm_array_info {
- __le32 size;
-
- __le16 size_hi;
- __le16 stripe_size;
-
- __le32 mode;
-
- __le16 stripe_blk_sz;
- __le16 reserved1;
-
- __le16 cyl;
- __le16 head;
-
- __le16 sect;
- u8 array_id;
- u8 reserved2;
-
- char name[40];
-
- __le32 array_status;
-
- /* device list continues beyond this point? */
-} __attribute__((packed));
-
-static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static void carm_remove_one (struct pci_dev *pdev);
-static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
-
-static const struct pci_device_id carm_pci_tbl[] = {
- { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
- { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
- { } /* terminate list */
-};
-MODULE_DEVICE_TABLE(pci, carm_pci_tbl);
-
-static struct pci_driver carm_driver = {
- .name = DRV_NAME,
- .id_table = carm_pci_tbl,
- .probe = carm_init_one,
- .remove = carm_remove_one,
-};
-
-static const struct block_device_operations carm_bd_ops = {
- .owner = THIS_MODULE,
- .getgeo = carm_bdev_getgeo,
-};
-
-static unsigned int carm_host_id;
-static unsigned long carm_major_alloc;
-
-
-
-static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct carm_port *port = bdev->bd_disk->private_data;
-
- geo->heads = (u8) port->dev_geom_head;
- geo->sectors = (u8) port->dev_geom_sect;
- geo->cylinders = port->dev_geom_cyl;
- return 0;
-}
-
-static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE };
-
-static inline int carm_lookup_bucket(u32 msg_size)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
- if (msg_size <= msg_sizes[i])
- return i;
-
- return -ENOENT;
-}
-
-static void carm_init_buckets(void __iomem *mmio)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
- writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i));
-}
-
-static inline void *carm_ref_msg(struct carm_host *host,
- unsigned int msg_idx)
-{
- return host->msg_base + (msg_idx * CARM_MSG_SIZE);
-}
-
-static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
- unsigned int msg_idx)
-{
- return host->msg_dma + (msg_idx * CARM_MSG_SIZE);
-}
-
-static int carm_send_msg(struct carm_host *host,
- struct carm_request *crq, unsigned tag)
-{
- void __iomem *mmio = host->mmio;
- u32 msg = (u32) carm_ref_msg_dma(host, tag);
- u32 cm_bucket = crq->msg_bucket;
- u32 tmp;
- int rc = 0;
-
- VPRINTK("ENTER\n");
-
- tmp = readl(mmio + CARM_HMUC);
- if (tmp & CARM_Q_FULL) {
-#if 0
- tmp = readl(mmio + CARM_INT_MASK);
- tmp |= INT_Q_AVAILABLE;
- writel(tmp, mmio + CARM_INT_MASK);
- readl(mmio + CARM_INT_MASK); /* flush */
-#endif
- DPRINTK("host msg queue full\n");
- rc = -EBUSY;
- } else {
- writel(msg | (cm_bucket << 1), mmio + CARM_IHQP);
- readl(mmio + CARM_IHQP); /* flush */
- }
-
- return rc;
-}
-
-static int carm_array_info (struct carm_host *host, unsigned int array_idx)
-{
- struct carm_msg_ioctl *ioc;
- u32 msg_data;
- dma_addr_t msg_dma;
- struct carm_request *crq;
- struct request *rq;
- int rc;
-
- rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
- if (IS_ERR(rq)) {
- rc = -ENOMEM;
- goto err_out;
- }
- crq = blk_mq_rq_to_pdu(rq);
-
- ioc = carm_ref_msg(host, rq->tag);
- msg_dma = carm_ref_msg_dma(host, rq->tag);
- msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
-
- crq->msg_type = CARM_MSG_ARRAY;
- crq->msg_subtype = CARM_ARRAY_INFO;
- rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) +
- sizeof(struct carm_array_info));
- BUG_ON(rc < 0);
- crq->msg_bucket = (u32) rc;
-
- memset(ioc, 0, sizeof(*ioc));
- ioc->type = CARM_MSG_ARRAY;
- ioc->subtype = CARM_ARRAY_INFO;
- ioc->array_id = (u8) array_idx;
- ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
- ioc->data_addr = cpu_to_le32(msg_data);
-
- spin_lock_irq(&host->lock);
- assert(host->state == HST_DEV_SCAN_START ||
- host->state == HST_DEV_SCAN);
- spin_unlock_irq(&host->lock);
-
- DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
- blk_execute_rq_nowait(rq, true);
-
- return 0;
-
-err_out:
- spin_lock_irq(&host->lock);
- host->state = HST_ERROR;
- spin_unlock_irq(&host->lock);
- return rc;
-}
-
-typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
-
-static int carm_send_special (struct carm_host *host, carm_sspc_t func)
-{
- struct request *rq;
- struct carm_request *crq;
- struct carm_msg_ioctl *ioc;
- void *mem;
- unsigned int msg_size;
- int rc;
-
- rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
- if (IS_ERR(rq))
- return -ENOMEM;
- crq = blk_mq_rq_to_pdu(rq);
-
- mem = carm_ref_msg(host, rq->tag);
-
- msg_size = func(host, rq->tag, mem);
-
- ioc = mem;
- crq->msg_type = ioc->type;
- crq->msg_subtype = ioc->subtype;
- rc = carm_lookup_bucket(msg_size);
- BUG_ON(rc < 0);
- crq->msg_bucket = (u32) rc;
-
- DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
- blk_execute_rq_nowait(rq, true);
-
- return 0;
-}
-
-static unsigned int carm_fill_sync_time(struct carm_host *host,
- unsigned int idx, void *mem)
-{
- struct carm_msg_sync_time *st = mem;
-
- time64_t tv = ktime_get_real_seconds();
-
- memset(st, 0, sizeof(*st));
- st->type = CARM_MSG_MISC;
- st->subtype = MISC_SET_TIME;
- st->handle = cpu_to_le32(TAG_ENCODE(idx));
- st->timestamp = cpu_to_le32(tv);
-
- return sizeof(struct carm_msg_sync_time);
-}
-
-static unsigned int carm_fill_alloc_buf(struct carm_host *host,
- unsigned int idx, void *mem)
-{
- struct carm_msg_allocbuf *ab = mem;
-
- memset(ab, 0, sizeof(*ab));
- ab->type = CARM_MSG_MISC;
- ab->subtype = MISC_ALLOC_MEM;
- ab->handle = cpu_to_le32(TAG_ENCODE(idx));
- ab->n_sg = 1;
- ab->sg_type = SGT_32BIT;
- ab->addr = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
- ab->len = cpu_to_le32(PDC_SHM_SIZE >> 1);
- ab->evt_pool = cpu_to_le32(host->shm_dma + (16 * 1024));
- ab->n_evt = cpu_to_le32(1024);
- ab->rbuf_pool = cpu_to_le32(host->shm_dma);
- ab->n_rbuf = cpu_to_le32(RMSG_Q_LEN);
- ab->msg_pool = cpu_to_le32(host->shm_dma + RBUF_LEN);
- ab->n_msg = cpu_to_le32(CARM_Q_LEN);
- ab->sg[0].start = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
- ab->sg[0].len = cpu_to_le32(65536);
-
- return sizeof(struct carm_msg_allocbuf);
-}
-
-static unsigned int carm_fill_scan_channels(struct carm_host *host,
- unsigned int idx, void *mem)
-{
- struct carm_msg_ioctl *ioc = mem;
- u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) +
- IOC_SCAN_CHAN_OFFSET);
-
- memset(ioc, 0, sizeof(*ioc));
- ioc->type = CARM_MSG_IOCTL;
- ioc->subtype = CARM_IOC_SCAN_CHAN;
- ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
- ioc->data_addr = cpu_to_le32(msg_data);
-
- /* fill output data area with "no device" default values */
- mem += IOC_SCAN_CHAN_OFFSET;
- memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS);
-
- return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS;
-}
-
-static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
- unsigned int idx, void *mem)
-{
- struct carm_msg_get_fw_ver *ioc = mem;
- u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc));
-
- memset(ioc, 0, sizeof(*ioc));
- ioc->type = CARM_MSG_MISC;
- ioc->subtype = MISC_GET_FW_VER;
- ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
- ioc->data_addr = cpu_to_le32(msg_data);
-
- return sizeof(struct carm_msg_get_fw_ver) +
- sizeof(struct carm_fw_ver);
-}
-
-static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
-{
- unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
-
- blk_mq_stop_hw_queues(q);
- VPRINTK("STOPPED QUEUE %p\n", q);
-
- host->wait_q[idx] = q;
- host->wait_q_prod++;
- BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */
-}
-
-static inline struct request_queue *carm_pop_q(struct carm_host *host)
-{
- unsigned int idx;
-
- if (host->wait_q_prod == host->wait_q_cons)
- return NULL;
-
- idx = host->wait_q_cons % CARM_MAX_WAIT_Q;
- host->wait_q_cons++;
-
- return host->wait_q[idx];
-}
-
-static inline void carm_round_robin(struct carm_host *host)
-{
- struct request_queue *q = carm_pop_q(host);
- if (q) {
- blk_mq_start_hw_queues(q);
- VPRINTK("STARTED QUEUE %p\n", q);
- }
-}
-
-static inline enum dma_data_direction carm_rq_dir(struct request *rq)
-{
- return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
-}
-
-static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
-{
- struct request_queue *q = hctx->queue;
- struct request *rq = bd->rq;
- struct carm_port *port = q->queuedata;
- struct carm_host *host = port->host;
- struct carm_request *crq = blk_mq_rq_to_pdu(rq);
- struct carm_msg_rw *msg;
- struct scatterlist *sg;
- int i, n_elem = 0, rc;
- unsigned int msg_size;
- u32 tmp;
-
- crq->n_elem = 0;
- sg_init_table(crq->sg, CARM_MAX_REQ_SG);
-
- blk_mq_start_request(rq);
-
- spin_lock_irq(&host->lock);
- if (req_op(rq) == REQ_OP_DRV_OUT)
- goto send_msg;
-
- /* get scatterlist from block layer */
- sg = &crq->sg[0];
- n_elem = blk_rq_map_sg(q, rq, sg);
- if (n_elem <= 0)
- goto out_ioerr;
-
- /* map scatterlist to PCI bus addresses */
- n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq));
- if (n_elem <= 0)
- goto out_ioerr;
-
- /* obey global hardware limit on S/G entries */
- if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem)
- goto out_resource;
-
- crq->n_elem = n_elem;
- host->hw_sg_used += n_elem;
-
- /*
- * build read/write message
- */
-
- VPRINTK("build msg\n");
- msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag);
-
- if (rq_data_dir(rq) == WRITE) {
- msg->type = CARM_MSG_WRITE;
- crq->msg_type = CARM_MSG_WRITE;
- } else {
- msg->type = CARM_MSG_READ;
- crq->msg_type = CARM_MSG_READ;
- }
-
- msg->id = port->port_no;
- msg->sg_count = n_elem;
- msg->sg_type = SGT_32BIT;
- msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
- msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
- tmp = (blk_rq_pos(rq) >> 16) >> 16;
- msg->lba_high = cpu_to_le16( (u16) tmp );
- msg->lba_count = cpu_to_le16(blk_rq_sectors(rq));
-
- msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg);
- for (i = 0; i < n_elem; i++) {
- struct carm_msg_sg *carm_sg = &msg->sg[i];
- carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i]));
- carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i]));
- msg_size += sizeof(struct carm_msg_sg);
- }
-
- rc = carm_lookup_bucket(msg_size);
- BUG_ON(rc < 0);
- crq->msg_bucket = (u32) rc;
-send_msg:
- /*
- * queue read/write message to hardware
- */
- VPRINTK("send msg, tag == %u\n", rq->tag);
- rc = carm_send_msg(host, crq, rq->tag);
- if (rc) {
- host->hw_sg_used -= n_elem;
- goto out_resource;
- }
-
- spin_unlock_irq(&host->lock);
- return BLK_STS_OK;
-out_resource:
- dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq));
- carm_push_q(host, q);
- spin_unlock_irq(&host->lock);
- return BLK_STS_DEV_RESOURCE;
-out_ioerr:
- carm_round_robin(host);
- spin_unlock_irq(&host->lock);
- return BLK_STS_IOERR;
-}
-
-static void carm_handle_array_info(struct carm_host *host,
- struct carm_request *crq, u8 *mem,
- blk_status_t error)
-{
- struct carm_port *port;
- u8 *msg_data = mem + sizeof(struct carm_array_info);
- struct carm_array_info *desc = (struct carm_array_info *) msg_data;
- u64 lo, hi;
- int cur_port;
- size_t slen;
-
- DPRINTK("ENTER\n");
-
- if (error)
- goto out;
- if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
- goto out;
-
- cur_port = host->cur_scan_dev;
-
- /* should never occur */
- if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) {
- printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n",
- cur_port, (int) desc->array_id);
- goto out;
- }
-
- port = &host->port[cur_port];
-
- lo = (u64) le32_to_cpu(desc->size);
- hi = (u64) le16_to_cpu(desc->size_hi);
-
- port->capacity = lo | (hi << 32);
- port->dev_geom_head = le16_to_cpu(desc->head);
- port->dev_geom_sect = le16_to_cpu(desc->sect);
- port->dev_geom_cyl = le16_to_cpu(desc->cyl);
-
- host->dev_active |= (1 << cur_port);
-
- strncpy(port->name, desc->name, sizeof(port->name));
- port->name[sizeof(port->name) - 1] = 0;
- slen = strlen(port->name);
- while (slen && (port->name[slen - 1] == ' ')) {
- port->name[slen - 1] = 0;
- slen--;
- }
-
- printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n",
- pci_name(host->pdev), port->port_no,
- (unsigned long long) port->capacity);
- printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n",
- pci_name(host->pdev), port->port_no, port->name);
-
-out:
- assert(host->state == HST_DEV_SCAN);
- schedule_work(&host->fsm_task);
-}
-
-static void carm_handle_scan_chan(struct carm_host *host,
- struct carm_request *crq, u8 *mem,
- blk_status_t error)
-{
- u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET;
- unsigned int i, dev_count = 0;
- int new_state = HST_DEV_SCAN_START;
-
- DPRINTK("ENTER\n");
-
- if (error) {
- new_state = HST_ERROR;
- goto out;
- }
-
- /* TODO: scan and support non-disk devices */
- for (i = 0; i < 8; i++)
- if (msg_data[i] == 0) { /* direct-access device (disk) */
- host->dev_present |= (1 << i);
- dev_count++;
- }
-
- printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n",
- pci_name(host->pdev), dev_count);
-
-out:
- assert(host->state == HST_PORT_SCAN);
- host->state = new_state;
- schedule_work(&host->fsm_task);
-}
-
-static void carm_handle_generic(struct carm_host *host,
- struct carm_request *crq, blk_status_t error,
- int cur_state, int next_state)
-{
- DPRINTK("ENTER\n");
-
- assert(host->state == cur_state);
- if (error)
- host->state = HST_ERROR;
- else
- host->state = next_state;
- schedule_work(&host->fsm_task);
-}
-
-static inline void carm_handle_resp(struct carm_host *host,
- __le32 ret_handle_le, u32 status)
-{
- u32 handle = le32_to_cpu(ret_handle_le);
- unsigned int msg_idx;
- struct request *rq;
- struct carm_request *crq;
- blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR;
- u8 *mem;
-
- VPRINTK("ENTER, handle == 0x%x\n", handle);
-
- if (unlikely(!TAG_VALID(handle))) {
- printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n",
- pci_name(host->pdev), handle);
- return;
- }
-
- msg_idx = TAG_DECODE(handle);
- VPRINTK("tag == %u\n", msg_idx);
-
- rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx);
- crq = blk_mq_rq_to_pdu(rq);
-
- /* fast path */
- if (likely(crq->msg_type == CARM_MSG_READ ||
- crq->msg_type == CARM_MSG_WRITE)) {
- dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem,
- carm_rq_dir(rq));
- goto done;
- }
-
- mem = carm_ref_msg(host, msg_idx);
-
- switch (crq->msg_type) {
- case CARM_MSG_IOCTL: {
- switch (crq->msg_subtype) {
- case CARM_IOC_SCAN_CHAN:
- carm_handle_scan_chan(host, crq, mem, error);
- goto done;
- default:
- /* unknown / invalid response */
- goto err_out;
- }
- break;
- }
-
- case CARM_MSG_MISC: {
- switch (crq->msg_subtype) {
- case MISC_ALLOC_MEM:
- carm_handle_generic(host, crq, error,
- HST_ALLOC_BUF, HST_SYNC_TIME);
- goto done;
- case MISC_SET_TIME:
- carm_handle_generic(host, crq, error,
- HST_SYNC_TIME, HST_GET_FW_VER);
- goto done;
- case MISC_GET_FW_VER: {
- struct carm_fw_ver *ver = (struct carm_fw_ver *)
- (mem + sizeof(struct carm_msg_get_fw_ver));
- if (!error) {
- host->fw_ver = le32_to_cpu(ver->version);
- host->flags |= (ver->features & FL_FW_VER_MASK);
- }
- carm_handle_generic(host, crq, error,
- HST_GET_FW_VER, HST_PORT_SCAN);
- goto done;
- }
- default:
- /* unknown / invalid response */
- goto err_out;
- }
- break;
- }
-
- case CARM_MSG_ARRAY: {
- switch (crq->msg_subtype) {
- case CARM_ARRAY_INFO:
- carm_handle_array_info(host, crq, mem, error);
- break;
- default:
- /* unknown / invalid response */
- goto err_out;
- }
- break;
- }
-
- default:
- /* unknown / invalid response */
- goto err_out;
- }
-
- return;
-
-err_out:
- printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
- pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
- error = BLK_STS_IOERR;
-done:
- host->hw_sg_used -= crq->n_elem;
- blk_mq_end_request(blk_mq_rq_from_pdu(crq), error);
-
- if (host->hw_sg_used <= CARM_SG_LOW_WATER)
- carm_round_robin(host);
-}
-
-static inline void carm_handle_responses(struct carm_host *host)
-{
- void __iomem *mmio = host->mmio;
- struct carm_response *resp = (struct carm_response *) host->shm;
- unsigned int work = 0;
- unsigned int idx = host->resp_idx % RMSG_Q_LEN;
-
- while (1) {
- u32 status = le32_to_cpu(resp[idx].status);
-
- if (status == 0xffffffff) {
- VPRINTK("ending response on index %u\n", idx);
- writel(idx << 3, mmio + CARM_RESP_IDX);
- break;
- }
-
- /* response to a message we sent */
- else if ((status & (1 << 31)) == 0) {
- VPRINTK("handling msg response on index %u\n", idx);
- carm_handle_resp(host, resp[idx].ret_handle, status);
- resp[idx].status = cpu_to_le32(0xffffffff);
- }
-
- /* asynchronous events the hardware throws our way */
- else if ((status & 0xff000000) == (1 << 31)) {
- u8 *evt_type_ptr = (u8 *) &resp[idx];
- u8 evt_type = *evt_type_ptr;
- printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n",
- pci_name(host->pdev), (int) evt_type);
- resp[idx].status = cpu_to_le32(0xffffffff);
- }
-
- idx = NEXT_RESP(idx);
- work++;
- }
-
- VPRINTK("EXIT, work==%u\n", work);
- host->resp_idx += work;
-}
-
-static irqreturn_t carm_interrupt(int irq, void *__host)
-{
- struct carm_host *host = __host;
- void __iomem *mmio;
- u32 mask;
- int handled = 0;
- unsigned long flags;
-
- if (!host) {
- VPRINTK("no host\n");
- return IRQ_NONE;
- }
-
- spin_lock_irqsave(&host->lock, flags);
-
- mmio = host->mmio;
-
- /* reading should also clear interrupts */
- mask = readl(mmio + CARM_INT_STAT);
-
- if (mask == 0 || mask == 0xffffffff) {
- VPRINTK("no work, mask == 0x%x\n", mask);
- goto out;
- }
-
- if (mask & INT_ACK_MASK)
- writel(mask, mmio + CARM_INT_STAT);
-
- if (unlikely(host->state == HST_INVALID)) {
- VPRINTK("not initialized yet, mask = 0x%x\n", mask);
- goto out;
- }
-
- if (mask & CARM_HAVE_RESP) {
- handled = 1;
- carm_handle_responses(host);
- }
-
-out:
- spin_unlock_irqrestore(&host->lock, flags);
- VPRINTK("EXIT\n");
- return IRQ_RETVAL(handled);
-}
-
-static void carm_fsm_task (struct work_struct *work)
-{
- struct carm_host *host =
- container_of(work, struct carm_host, fsm_task);
- unsigned long flags;
- unsigned int state;
- int rc, i, next_dev;
- int reschedule = 0;
- int new_state = HST_INVALID;
-
- spin_lock_irqsave(&host->lock, flags);
- state = host->state;
- spin_unlock_irqrestore(&host->lock, flags);
-
- DPRINTK("ENTER, state == %s\n", state_name[state]);
-
- switch (state) {
- case HST_PROBE_START:
- new_state = HST_ALLOC_BUF;
- reschedule = 1;
- break;
-
- case HST_ALLOC_BUF:
- rc = carm_send_special(host, carm_fill_alloc_buf);
- if (rc) {
- new_state = HST_ERROR;
- reschedule = 1;
- }
- break;
-
- case HST_SYNC_TIME:
- rc = carm_send_special(host, carm_fill_sync_time);
- if (rc) {
- new_state = HST_ERROR;
- reschedule = 1;
- }
- break;
-
- case HST_GET_FW_VER:
- rc = carm_send_special(host, carm_fill_get_fw_ver);
- if (rc) {
- new_state = HST_ERROR;
- reschedule = 1;
- }
- break;
-
- case HST_PORT_SCAN:
- rc = carm_send_special(host, carm_fill_scan_channels);
- if (rc) {
- new_state = HST_ERROR;
- reschedule = 1;
- }
- break;
-
- case HST_DEV_SCAN_START:
- host->cur_scan_dev = -1;
- new_state = HST_DEV_SCAN;
- reschedule = 1;
- break;
-
- case HST_DEV_SCAN:
- next_dev = -1;
- for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++)
- if (host->dev_present & (1 << i)) {
- next_dev = i;
- break;
- }
-
- if (next_dev >= 0) {
- host->cur_scan_dev = next_dev;
- rc = carm_array_info(host, next_dev);
- if (rc) {
- new_state = HST_ERROR;
- reschedule = 1;
- }
- } else {
- new_state = HST_DEV_ACTIVATE;
- reschedule = 1;
- }
- break;
-
- case HST_DEV_ACTIVATE: {
- int activated = 0;
- for (i = 0; i < CARM_MAX_PORTS; i++)
- if (host->dev_active & (1 << i)) {
- struct carm_port *port = &host->port[i];
- struct gendisk *disk = port->disk;
-
- set_capacity(disk, port->capacity);
- host->probe_err = add_disk(disk);
- if (!host->probe_err)
- activated++;
- else
- break;
- }
-
- printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n",
- pci_name(host->pdev), activated);
-
- new_state = HST_PROBE_FINISHED;
- reschedule = 1;
- break;
- }
- case HST_PROBE_FINISHED:
- complete(&host->probe_comp);
- break;
- case HST_ERROR:
- /* FIXME: TODO */
- break;
-
- default:
- /* should never occur */
- printk(KERN_ERR PFX "BUG: unknown state %d\n", state);
- assert(0);
- break;
- }
-
- if (new_state != HST_INVALID) {
- spin_lock_irqsave(&host->lock, flags);
- host->state = new_state;
- spin_unlock_irqrestore(&host->lock, flags);
- }
- if (reschedule)
- schedule_work(&host->fsm_task);
-}
-
-static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit)
-{
- unsigned int i;
-
- for (i = 0; i < 50000; i++) {
- u32 tmp = readl(mmio + CARM_LMUC);
- udelay(100);
-
- if (test_bit) {
- if ((tmp & bits) == bits)
- return 0;
- } else {
- if ((tmp & bits) == 0)
- return 0;
- }
-
- cond_resched();
- }
-
- printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n",
- bits, test_bit ? "yes" : "no");
- return -EBUSY;
-}
-
-static void carm_init_responses(struct carm_host *host)
-{
- void __iomem *mmio = host->mmio;
- unsigned int i;
- struct carm_response *resp = (struct carm_response *) host->shm;
-
- for (i = 0; i < RMSG_Q_LEN; i++)
- resp[i].status = cpu_to_le32(0xffffffff);
-
- writel(0, mmio + CARM_RESP_IDX);
-}
-
-static int carm_init_host(struct carm_host *host)
-{
- void __iomem *mmio = host->mmio;
- u32 tmp;
- u8 tmp8;
- int rc;
-
- DPRINTK("ENTER\n");
-
- writel(0, mmio + CARM_INT_MASK);
-
- tmp8 = readb(mmio + CARM_INITC);
- if (tmp8 & 0x01) {
- tmp8 &= ~0x01;
- writeb(tmp8, mmio + CARM_INITC);
- readb(mmio + CARM_INITC); /* flush */
-
- DPRINTK("snooze...\n");
- msleep(5000);
- }
-
- tmp = readl(mmio + CARM_HMUC);
- if (tmp & CARM_CME) {
- DPRINTK("CME bit present, waiting\n");
- rc = carm_init_wait(mmio, CARM_CME, 1);
- if (rc) {
- DPRINTK("EXIT, carm_init_wait 1 failed\n");
- return rc;
- }
- }
- if (tmp & CARM_RME) {
- DPRINTK("RME bit present, waiting\n");
- rc = carm_init_wait(mmio, CARM_RME, 1);
- if (rc) {
- DPRINTK("EXIT, carm_init_wait 2 failed\n");
- return rc;
- }
- }
-
- tmp &= ~(CARM_RME | CARM_CME);
- writel(tmp, mmio + CARM_HMUC);
- readl(mmio + CARM_HMUC); /* flush */
-
- rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0);
- if (rc) {
- DPRINTK("EXIT, carm_init_wait 3 failed\n");
- return rc;
- }
-
- carm_init_buckets(mmio);
-
- writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO);
- writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI);
- writel(RBUF_LEN, mmio + RBUF_BYTE_SZ);
-
- tmp = readl(mmio + CARM_HMUC);
- tmp |= (CARM_RME | CARM_CME | CARM_WZBC);
- writel(tmp, mmio + CARM_HMUC);
- readl(mmio + CARM_HMUC); /* flush */
-
- rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1);
- if (rc) {
- DPRINTK("EXIT, carm_init_wait 4 failed\n");
- return rc;
- }
-
- writel(0, mmio + CARM_HMPHA);
- writel(INT_DEF_MASK, mmio + CARM_INT_MASK);
-
- carm_init_responses(host);
-
- /* start initialization, probing state machine */
- spin_lock_irq(&host->lock);
- assert(host->state == HST_INVALID);
- host->state = HST_PROBE_START;
- spin_unlock_irq(&host->lock);
- schedule_work(&host->fsm_task);
-
- DPRINTK("EXIT\n");
- return 0;
-}
-
-static const struct blk_mq_ops carm_mq_ops = {
- .queue_rq = carm_queue_rq,
-};
-
-static int carm_init_disk(struct carm_host *host, unsigned int port_no)
-{
- struct carm_port *port = &host->port[port_no];
- struct gendisk *disk;
-
- port->host = host;
- port->port_no = port_no;
-
- disk = blk_mq_alloc_disk(&host->tag_set, port);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
-
- port->disk = disk;
- sprintf(disk->disk_name, DRV_NAME "/%u",
- (unsigned int)host->id * CARM_MAX_PORTS + port_no);
- disk->major = host->major;
- disk->first_minor = port_no * CARM_MINORS_PER_MAJOR;
- disk->minors = CARM_MINORS_PER_MAJOR;
- disk->fops = &carm_bd_ops;
- disk->private_data = port;
-
- blk_queue_max_segments(disk->queue, CARM_MAX_REQ_SG);
- blk_queue_segment_boundary(disk->queue, CARM_SG_BOUNDARY);
- return 0;
-}
-
-static void carm_free_disk(struct carm_host *host, unsigned int port_no)
-{
- struct carm_port *port = &host->port[port_no];
- struct gendisk *disk = port->disk;
-
- if (!disk)
- return;
-
- if (host->state > HST_DEV_ACTIVATE)
- del_gendisk(disk);
- put_disk(disk);
-}
-
-static int carm_init_shm(struct carm_host *host)
-{
- host->shm = dma_alloc_coherent(&host->pdev->dev, CARM_SHM_SIZE,
- &host->shm_dma, GFP_KERNEL);
- if (!host->shm)
- return -ENOMEM;
-
- host->msg_base = host->shm + RBUF_LEN;
- host->msg_dma = host->shm_dma + RBUF_LEN;
-
- memset(host->shm, 0xff, RBUF_LEN);
- memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN);
-
- return 0;
-}
-
-static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
-{
- struct carm_host *host;
- int rc;
- struct request_queue *q;
- unsigned int i;
-
- printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
-
- rc = pci_enable_device(pdev);
- if (rc)
- return rc;
-
- rc = pci_request_regions(pdev, DRV_NAME);
- if (rc)
- goto err_out;
-
- rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
- if (rc) {
- printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n",
- pci_name(pdev));
- goto err_out_regions;
- }
-
- host = kzalloc(sizeof(*host), GFP_KERNEL);
- if (!host) {
- rc = -ENOMEM;
- goto err_out_regions;
- }
-
- host->pdev = pdev;
- spin_lock_init(&host->lock);
- INIT_WORK(&host->fsm_task, carm_fsm_task);
- init_completion(&host->probe_comp);
-
- host->mmio = ioremap(pci_resource_start(pdev, 0),
- pci_resource_len(pdev, 0));
- if (!host->mmio) {
- printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n",
- pci_name(pdev));
- rc = -ENOMEM;
- goto err_out_kfree;
- }
-
- rc = carm_init_shm(host);
- if (rc) {
- printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n",
- pci_name(pdev));
- goto err_out_iounmap;
- }
-
- memset(&host->tag_set, 0, sizeof(host->tag_set));
- host->tag_set.ops = &carm_mq_ops;
- host->tag_set.cmd_size = sizeof(struct carm_request);
- host->tag_set.nr_hw_queues = 1;
- host->tag_set.nr_maps = 1;
- host->tag_set.queue_depth = max_queue;
- host->tag_set.numa_node = NUMA_NO_NODE;
- host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-
- rc = blk_mq_alloc_tag_set(&host->tag_set);
- if (rc)
- goto err_out_dma_free;
-
- q = blk_mq_init_queue(&host->tag_set);
- if (IS_ERR(q)) {
- rc = PTR_ERR(q);
- blk_mq_free_tag_set(&host->tag_set);
- goto err_out_dma_free;
- }
-
- host->oob_q = q;
- q->queuedata = host;
-
- /*
- * Figure out which major to use: 160, 161, or dynamic
- */
- if (!test_and_set_bit(0, &carm_major_alloc))
- host->major = 160;
- else if (!test_and_set_bit(1, &carm_major_alloc))
- host->major = 161;
- else
- host->flags |= FL_DYN_MAJOR;
-
- host->id = carm_host_id;
- sprintf(host->name, DRV_NAME "%d", carm_host_id);
-
- rc = register_blkdev(host->major, host->name);
- if (rc < 0)
- goto err_out_free_majors;
- if (host->flags & FL_DYN_MAJOR)
- host->major = rc;
-
- for (i = 0; i < CARM_MAX_PORTS; i++) {
- rc = carm_init_disk(host, i);
- if (rc)
- goto err_out_blkdev_disks;
- }
-
- pci_set_master(pdev);
-
- rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host);
- if (rc) {
- printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n",
- pci_name(pdev));
- goto err_out_blkdev_disks;
- }
-
- rc = carm_init_host(host);
- if (rc)
- goto err_out_free_irq;
-
- DPRINTK("waiting for probe_comp\n");
- host->probe_err = -ENODEV;
- wait_for_completion(&host->probe_comp);
- if (host->probe_err) {
- rc = host->probe_err;
- goto err_out_free_irq;
- }
-
- printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n",
- host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
- (unsigned long long)pci_resource_start(pdev, 0),
- pdev->irq, host->major);
-
- carm_host_id++;
- pci_set_drvdata(pdev, host);
- return 0;
-
-err_out_free_irq:
- free_irq(pdev->irq, host);
-err_out_blkdev_disks:
- for (i = 0; i < CARM_MAX_PORTS; i++)
- carm_free_disk(host, i);
- unregister_blkdev(host->major, host->name);
-err_out_free_majors:
- if (host->major == 160)
- clear_bit(0, &carm_major_alloc);
- else if (host->major == 161)
- clear_bit(1, &carm_major_alloc);
- blk_mq_destroy_queue(host->oob_q);
- blk_mq_free_tag_set(&host->tag_set);
-err_out_dma_free:
- dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
-err_out_iounmap:
- iounmap(host->mmio);
-err_out_kfree:
- kfree(host);
-err_out_regions:
- pci_release_regions(pdev);
-err_out:
- pci_disable_device(pdev);
- return rc;
-}
-
-static void carm_remove_one (struct pci_dev *pdev)
-{
- struct carm_host *host = pci_get_drvdata(pdev);
- unsigned int i;
-
- if (!host) {
- printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
- pci_name(pdev));
- return;
- }
-
- free_irq(pdev->irq, host);
- for (i = 0; i < CARM_MAX_PORTS; i++)
- carm_free_disk(host, i);
- unregister_blkdev(host->major, host->name);
- if (host->major == 160)
- clear_bit(0, &carm_major_alloc);
- else if (host->major == 161)
- clear_bit(1, &carm_major_alloc);
- blk_mq_destroy_queue(host->oob_q);
- blk_mq_free_tag_set(&host->tag_set);
- dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
- iounmap(host->mmio);
- kfree(host);
- pci_release_regions(pdev);
- pci_disable_device(pdev);
-}
-
-module_pci_driver(carm_driver);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 3f1906965ac8..2b7d1db5c4a7 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -47,7 +47,12 @@
#define UBLK_MINORS (1U << MINORBITS)
/* All UBLK_F_* have to be included into UBLK_F_ALL */
-#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
+#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
+ | UBLK_F_URING_CMD_COMP_IN_TASK \
+ | UBLK_F_NEED_GET_DATA)
+
+/* All UBLK_PARAM_TYPE_* should be included here */
+#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
struct ublk_rq_data {
struct callback_head work;
@@ -86,6 +91,15 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_ABORTED 0x04
+/*
+ * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
+ * get data buffer address from ublksrv.
+ *
+ * Then, bio data could be copied into this data buffer for a WRITE request
+ * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
+ */
+#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@@ -119,7 +133,6 @@ struct ublk_device {
char *__queues;
unsigned short queue_size;
- unsigned short bs_shift;
struct ublksrv_ctrl_dev_info dev_info;
struct blk_mq_tag_set tag_set;
@@ -137,6 +150,8 @@ struct ublk_device {
spinlock_t mm_lock;
struct mm_struct *mm;
+ struct ublk_params params;
+
struct completion completion;
unsigned int nr_queues_ready;
atomic_t nr_aborted_queues;
@@ -149,6 +164,12 @@ struct ublk_device {
struct work_struct stop_work;
};
+/* header of ublk_params */
+struct ublk_params_header {
+ __u32 len;
+ __u32 types;
+};
+
static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;
@@ -160,6 +181,90 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
static struct miscdevice ublk_misc;
+static void ublk_dev_param_basic_apply(struct ublk_device *ub)
+{
+ struct request_queue *q = ub->ub_disk->queue;
+ const struct ublk_param_basic *p = &ub->params.basic;
+
+ blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
+ blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
+ blk_queue_io_min(q, 1 << p->io_min_shift);
+ blk_queue_io_opt(q, 1 << p->io_opt_shift);
+
+ blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
+ p->attrs & UBLK_ATTR_FUA);
+ if (p->attrs & UBLK_ATTR_ROTATIONAL)
+ blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+ else
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+
+ blk_queue_max_hw_sectors(q, p->max_sectors);
+ blk_queue_chunk_sectors(q, p->chunk_sectors);
+ blk_queue_virt_boundary(q, p->virt_boundary_mask);
+
+ if (p->attrs & UBLK_ATTR_READ_ONLY)
+ set_disk_ro(ub->ub_disk, true);
+
+ set_capacity(ub->ub_disk, p->dev_sectors);
+}
+
+static void ublk_dev_param_discard_apply(struct ublk_device *ub)
+{
+ struct request_queue *q = ub->ub_disk->queue;
+ const struct ublk_param_discard *p = &ub->params.discard;
+
+ q->limits.discard_alignment = p->discard_alignment;
+ q->limits.discard_granularity = p->discard_granularity;
+ blk_queue_max_discard_sectors(q, p->max_discard_sectors);
+ blk_queue_max_write_zeroes_sectors(q,
+ p->max_write_zeroes_sectors);
+ blk_queue_max_discard_segments(q, p->max_discard_segments);
+}
+
+static int ublk_validate_params(const struct ublk_device *ub)
+{
+ /* basic param is the only one which must be set */
+ if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
+ const struct ublk_param_basic *p = &ub->params.basic;
+
+ if (p->logical_bs_shift > PAGE_SHIFT)
+ return -EINVAL;
+
+ if (p->logical_bs_shift > p->physical_bs_shift)
+ return -EINVAL;
+
+ if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
+ return -EINVAL;
+ } else
+ return -EINVAL;
+
+ if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+ const struct ublk_param_discard *p = &ub->params.discard;
+
+ /* So far, only support single segment discard */
+ if (p->max_discard_sectors && p->max_discard_segments != 1)
+ return -EINVAL;
+
+ if (!p->discard_granularity)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ublk_apply_params(struct ublk_device *ub)
+{
+ if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+ return -EINVAL;
+
+ ublk_dev_param_basic_apply(ub);
+
+ if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
+ ublk_dev_param_discard_apply(ub);
+
+ return 0;
+}
+
static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
@@ -168,6 +273,13 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
return false;
}
+static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
+{
+ if (ubq->flags & UBLK_F_NEED_GET_DATA)
+ return true;
+ return false;
+}
+
static struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
@@ -509,6 +621,21 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
}
}
+static void ubq_complete_io_cmd(struct ublk_io *io, int res)
+{
+ /* mark this cmd owned by ublksrv */
+ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
+
+ /*
+ * clear ACTIVE since we are done with this sqe/cmd slot
+ * We can only accept io cmd in case of being not active.
+ */
+ io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+
+ /* tell ublksrv one io request is coming */
+ io_uring_cmd_done(io->cmd, res, 0);
+}
+
#define UBLK_REQUEUE_DELAY_MS 3
static inline void __ublk_rq_task_work(struct request *req)
@@ -531,6 +658,30 @@ static inline void __ublk_rq_task_work(struct request *req)
return;
}
+ if (ublk_need_get_data(ubq) &&
+ (req_op(req) == REQ_OP_WRITE ||
+ req_op(req) == REQ_OP_FLUSH)) {
+ /*
+ * We have not handled UBLK_IO_NEED_GET_DATA command yet,
+ * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+ * and notify it.
+ */
+ if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+ pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
+ __func__, io->cmd->cmd_op, ubq->q_id,
+ req->tag, io->flags);
+ ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
+ return;
+ }
+ /*
+ * We have handled UBLK_IO_NEED_GET_DATA command,
+ * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+ * do the copy work.
+ */
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+ }
+
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
@@ -553,17 +704,7 @@ static inline void __ublk_rq_task_work(struct request *req)
mapped_bytes >> 9;
}
- /* mark this cmd owned by ublksrv */
- io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
-
- /*
- * clear ACTIVE since we are done with this sqe/cmd slot
- * We can only accept io cmd in case of being not active.
- */
- io->flags &= ~UBLK_IO_FLAG_ACTIVE;
-
- /* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
+ ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
}
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
@@ -788,16 +929,27 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
UBLK_DAEMON_MONITOR_PERIOD);
}
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+ return ubq->nr_io_ready == ubq->q_depth;
+}
+
static void ublk_cancel_queue(struct ublk_queue *ubq)
{
int i;
+ if (!ublk_queue_ready(ubq))
+ return;
+
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
if (io->flags & UBLK_IO_FLAG_ACTIVE)
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
}
+
+ /* all io commands are canceled */
+ ubq->nr_io_ready = 0;
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -818,19 +970,14 @@ static void ublk_stop_dev(struct ublk_device *ub)
del_gendisk(ub->ub_disk);
ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1;
- ublk_cancel_dev(ub);
put_disk(ub->ub_disk);
ub->ub_disk = NULL;
unlock:
+ ublk_cancel_dev(ub);
mutex_unlock(&ub->mutex);
cancel_delayed_work_sync(&ub->monitor_work);
}
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
-{
- return ubq->nr_io_ready == ubq->q_depth;
-}
-
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
@@ -846,6 +993,25 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
mutex_unlock(&ub->mutex);
}
+static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
+ int tag, struct io_uring_cmd *cmd)
+{
+ struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+ struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+
+ if (ublk_can_use_task_work(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ /* should not fail since we call it just in ubq->ubq_daemon */
+ task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
+ } else {
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->req = req;
+ io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ }
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -884,6 +1050,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto out;
}
+ /*
+ * ensure that the user issues UBLK_IO_NEED_GET_DATA
+ * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
+ */
+ if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
+ ^ (cmd_op == UBLK_IO_NEED_GET_DATA))
+ goto out;
+
switch (cmd_op) {
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
@@ -917,6 +1091,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
io->cmd = cmd;
ublk_commit_completion(ub, ub_cmd);
break;
+ case UBLK_IO_NEED_GET_DATA:
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+ goto out;
+ io->addr = ub_cmd->addr;
+ io->cmd = cmd;
+ io->flags |= UBLK_IO_FLAG_ACTIVE;
+ ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
+ break;
default:
goto out;
}
@@ -1083,13 +1265,13 @@ static void ublk_stop_work_fn(struct work_struct *work)
ublk_stop_dev(ub);
}
-/* align maximum I/O size to PAGE_SIZE */
+/* align max io buffer size with PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
- unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;
+ unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
- ub->dev_info.rq_max_blocks =
- round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
+ ub->dev_info.max_io_buf_bytes =
+ round_down(max_io_bytes, PAGE_SIZE);
}
static int ublk_add_tag_set(struct ublk_device *ub)
@@ -1132,7 +1314,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
{
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
int ublksrv_pid = (int)header->data[0];
- unsigned long dev_blocks = header->data[1];
struct ublk_device *ub;
struct gendisk *disk;
int ret = -EINVAL;
@@ -1155,10 +1336,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
goto out_unlock;
}
- /* We may get disk size updated */
- if (dev_blocks)
- ub->dev_info.dev_blocks = dev_blocks;
-
disk = blk_mq_alloc_disk(&ub->tag_set, ub);
if (IS_ERR(disk)) {
ret = PTR_ERR(disk);
@@ -1168,27 +1345,28 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
disk->fops = &ub_fops;
disk->private_data = ub;
- blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
- blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
- blk_queue_io_min(disk->queue, ub->dev_info.block_size);
- blk_queue_max_hw_sectors(disk->queue,
- ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
- disk->queue->limits.discard_granularity = PAGE_SIZE;
- blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
- blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);
-
- set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
-
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk;
+
+ ret = ublk_apply_params(ub);
+ if (ret)
+ goto out_put_disk;
+
get_device(&ub->cdev_dev);
ret = add_disk(disk);
if (ret) {
- put_disk(disk);
- goto out_unlock;
+ /*
+ * Has to drop the reference since ->free_disk won't be
+ * called in case of add_disk failure.
+ */
+ ublk_put_device(ub);
+ goto out_put_disk;
}
set_bit(UB_STATE_USED, &ub->state);
ub->dev_info.state = UBLK_S_DEV_LIVE;
+out_put_disk:
+ if (ret)
+ put_disk(disk);
out_unlock:
mutex_unlock(&ub->mutex);
ublk_put_device(ub);
@@ -1250,9 +1428,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
pr_devel("%s: dev id %d flags %llx\n", __func__,
info->dev_id, info->flags);
- pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
- info->nr_hw_queues, info->queue_depth,
- info->block_size, info->dev_blocks);
+ pr_devel("\t nr_hw_queues %d queue_depth %d\n",
+ info->nr_hw_queues, info->queue_depth);
}
static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
@@ -1312,7 +1489,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
- ub->bs_shift = ilog2(ub->dev_info.block_size);
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
@@ -1436,6 +1612,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
return ret;
}
+static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
+{
+ struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ void __user *argp = (void __user *)(unsigned long)header->addr;
+ struct ublk_params_header ph;
+ struct ublk_device *ub;
+ int ret;
+
+ if (header->len <= sizeof(ph) || !header->addr)
+ return -EINVAL;
+
+ if (copy_from_user(&ph, argp, sizeof(ph)))
+ return -EFAULT;
+
+ if (ph.len > header->len || !ph.len)
+ return -EINVAL;
+
+ if (ph.len > sizeof(struct ublk_params))
+ ph.len = sizeof(struct ublk_params);
+
+ ub = ublk_get_device_from_id(header->dev_id);
+ if (!ub)
+ return -EINVAL;
+
+ mutex_lock(&ub->mutex);
+ if (copy_to_user(argp, &ub->params, ph.len))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ mutex_unlock(&ub->mutex);
+
+ ublk_put_device(ub);
+ return ret;
+}
+
+static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
+{
+ struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ void __user *argp = (void __user *)(unsigned long)header->addr;
+ struct ublk_params_header ph;
+ struct ublk_device *ub;
+ int ret = -EFAULT;
+
+ if (header->len <= sizeof(ph) || !header->addr)
+ return -EINVAL;
+
+ if (copy_from_user(&ph, argp, sizeof(ph)))
+ return -EFAULT;
+
+ if (ph.len > header->len || !ph.len || !ph.types)
+ return -EINVAL;
+
+ if (ph.len > sizeof(struct ublk_params))
+ ph.len = sizeof(struct ublk_params);
+
+ ub = ublk_get_device_from_id(header->dev_id);
+ if (!ub)
+ return -EINVAL;
+
+ /* parameters can only be changed when device isn't live */
+ mutex_lock(&ub->mutex);
+ if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+ ret = -EACCES;
+ } else if (copy_from_user(&ub->params, argp, ph.len)) {
+ ret = -EFAULT;
+ } else {
+ /* clear all we don't support yet */
+ ub->params.types &= UBLK_PARAM_TYPE_ALL;
+ ret = ublk_validate_params(ub);
+ }
+ mutex_unlock(&ub->mutex);
+ ublk_put_device(ub);
+
+ return ret;
+}
+
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -1471,6 +1723,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_GET_QUEUE_AFFINITY:
ret = ublk_ctrl_get_queue_affinity(cmd);
break;
+ case UBLK_CMD_GET_PARAMS:
+ ret = ublk_ctrl_get_params(cmd);
+ break;
+ case UBLK_CMD_SET_PARAMS:
+ ret = ublk_ctrl_set_params(cmd);
+ break;
default:
break;
}
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index cf3e8096942a..529c9d04e9a4 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG
operations that get stuck.
config BCACHE_ASYNC_REGISTRATION
- bool "Asynchronous device registration (EXPERIMENTAL)"
+ bool "Asynchronous device registration"
depends on BCACHE
help
Add a sysfs file /sys/fs/bcache/register_async. Writing registering
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1ec17c32867f..c640be453313 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 99642f69bfa7..28bd4a35b86b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
while (io) {
struct dm_io *next = io->next;
- dm_io_rewind(io, &md->queue->bio_split);
+ dm_io_rewind(io, &md->disk->bio_split);
io->next = NULL;
__dm_io_complete(io, false);
@@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
* Does the target need to split IO even further?
* - varied (per target) IO splitting is a tenet of DM; this
* explains why stacked chunk_sectors based splitting via
- * blk_queue_split() isn't possible here.
+ * bio_split_to_limits() isn't possible here.
*/
if (!ti->max_io_len)
return len;
@@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
is_abnormal = is_abnormal_io(bio);
if (unlikely(is_abnormal)) {
/*
- * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+ * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
*/
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
}
init_clone_info(&ci, md, map, bio, is_abnormal);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 2cf973722f59..91836e6de326 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args)
char *devname = args->device_names;
dev_t devices[MD_SB_DISKS + 1], mdev;
struct mdu_array_info_s ainfo = { };
- struct block_device *bdev;
struct mddev *mddev;
int err = 0, i;
char name[16];
@@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_info("md: Loading %s: %s\n", name, args->device_names);
- bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
- if (IS_ERR(bdev)) {
- pr_err("md: open failed - cannot start array %s\n", name);
+ mddev = md_alloc(mdev, name);
+ if (IS_ERR(mddev)) {
+ pr_err("md: md_alloc failed - cannot start array %s\n", name);
return;
}
- err = -EIO;
- if (WARN(bdev->bd_disk->fops != &md_fops,
- "Opening block device %x resulted in non-md device\n",
- mdev))
- goto out_blkdev_put;
-
- mddev = bdev->bd_disk->private_data;
-
err = mddev_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
- goto out_blkdev_put;
+ goto out_mddev_put;
}
if (!list_empty(&mddev->disks) || mddev->raid_disks) {
@@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_warn("md: starting %s failed\n", name);
out_unlock:
mddev_unlock(mddev);
-out_blkdev_put:
- blkdev_put(bdev, FMODE_READ);
+out_mddev_put:
+ mddev_put(mddev);
}
static int __init raid_setup(char *str)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 37cbcce3cc66..742b2349fea3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -40,7 +40,7 @@ struct resync_info {
/* Lock the send communication. This is done through
* bit manipulation as opposed to a mutex in order to
- * accomodate lock and hold. See next comment.
+ * accommodate lock and hold. See next comment.
*/
#define MD_CLUSTER_SEND_LOCK 4
/* If cluster operations (such as adding a disk) must lock the
@@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
/*
* If resync thread run after raid1d thread, then process_metadata_update
* could not continue if raid1d held reconfig_mutex (and raid1d is blocked
- * since another node already got EX on Token and waitting the EX of Ack),
+ * since another node already got EX on Token and waiting the EX of Ack),
* so let resync wake up thread in case flag is set.
*/
if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4df78e30b76a..afaf36b2f6ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-/*
- * iterates through all used mddevs in the system.
- * We take care to grab the all_mddevs_lock whenever navigating
- * the list, and to always hold a refcount when unlocked.
- * Any code which breaks out of this loop while own
- * a reference to the current mddev and must mddev_put it.
- */
-#define for_each_mddev(_mddev,_tmp) \
- \
- for (({ spin_lock(&all_mddevs_lock); \
- _tmp = all_mddevs.next; \
- _mddev = NULL;}); \
- ({ if (_tmp != &all_mddevs) \
- mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
- spin_unlock(&all_mddevs_lock); \
- if (_mddev) mddev_put(_mddev); \
- _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
- _tmp != &all_mddevs;}); \
- ({ spin_lock(&all_mddevs_lock); \
- _tmp = _tmp->next;}) \
- )
-
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
return;
}
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
@@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request);
static inline struct mddev *mddev_get(struct mddev *mddev)
{
+ lockdep_assert_held(&all_mddevs_lock);
+
+ if (test_bit(MD_DELETED, &mddev->flags))
+ return NULL;
atomic_inc(&mddev->active);
return mddev;
}
static void mddev_delayed_delete(struct work_struct *ws);
-static void mddev_put(struct mddev *mddev)
+void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev)
mddev->ctime == 0 && !mddev->hold_active) {
/* Array is not configured at all, and not held active,
* so destroy it */
- list_del_init(&mddev->all_mddevs);
+ set_bit(MD_DELETED, &mddev->flags);
/*
* Call queue_work inside the spinlock so that
@@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t);
void mddev_init(struct mddev *mddev)
{
- kobject_init(&mddev->kobj, &md_ktype);
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->bitmap_info.mutex);
@@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void)
return dev;
}
-static struct mddev *mddev_find(dev_t unit)
-{
- struct mddev *mddev;
-
- if (MAJOR(unit) != MD_MAJOR)
- unit &= ~((1 << MdpMinorShift) - 1);
-
- spin_lock(&all_mddevs_lock);
- mddev = mddev_find_locked(unit);
- if (mddev)
- mddev_get(mddev);
- spin_unlock(&all_mddevs_lock);
-
- return mddev;
-}
-
static struct mddev *mddev_alloc(dev_t unit)
{
struct mddev *new;
@@ -791,6 +756,15 @@ out_free_new:
return ERR_PTR(error);
}
+static void mddev_free(struct mddev *mddev)
+{
+ spin_lock(&all_mddevs_lock);
+ list_del(&mddev->all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+
+ kfree(mddev);
+}
+
static const struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
@@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}
-static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
{
/* check if two start/length pairs overlap */
- if (s1+l1 <= s2)
- return 0;
- if (s2+l2 <= s1)
- return 0;
- return 1;
+ if (a->data_offset + a->sectors <= b->data_offset)
+ return false;
+ if (b->data_offset + b->sectors <= a->data_offset)
+ return false;
+ return true;
+}
+
+static bool md_rdev_overlaps(struct md_rdev *rdev)
+{
+ struct mddev *mddev;
+ struct md_rdev *rdev2;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+ if (test_bit(MD_DELETED, &mddev->flags))
+ continue;
+ rdev_for_each(rdev2, mddev) {
+ if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
+ md_rdevs_overlap(rdev, rdev2)) {
+ spin_unlock(&all_mddevs_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&all_mddevs_lock);
+ return false;
}
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
@@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
return -EINVAL; /* component must fit device */
rdev->sectors = sectors;
- if (sectors > oldsectors && my_mddev->external) {
- /* Need to check that all other rdevs with the same
- * ->bdev do not overlap. 'rcu' is sufficient to walk
- * the rdev lists safely.
- * This check does not provide a hard guarantee, it
- * just helps avoid dangerous mistakes.
- */
- struct mddev *mddev;
- int overlap = 0;
- struct list_head *tmp;
-
- rcu_read_lock();
- for_each_mddev(mddev, tmp) {
- struct md_rdev *rdev2;
- rdev_for_each(rdev2, mddev)
- if (rdev->bdev == rdev2->bdev &&
- rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->sectors,
- rdev2->data_offset,
- rdev2->sectors)) {
- overlap = 1;
- break;
- }
- if (overlap) {
- mddev_put(mddev);
- break;
- }
- }
- rcu_read_unlock();
- if (overlap) {
- /* Someone else could have slipped in a size
- * change here, but doing so is just silly.
- * We put oldsectors back because we *know* it is
- * safe, and trust userspace not to race with
- * itself
- */
- rdev->sectors = oldsectors;
- return -EBUSY;
- }
+ /*
+ * Check that all other rdevs with the same bdev do not overlap. This
+ * check does not provide a hard guarantee, it just helps avoid
+ * dangerous mistakes.
+ */
+ if (sectors > oldsectors && my_mddev->external &&
+ md_rdev_overlaps(rdev)) {
+ /*
+ * Someone else could have slipped in a size change here, but
+ * doing so is just silly. We put oldsectors back because we
+ * know it is safe, and trust userspace not to race with itself.
+ */
+ rdev->sectors = oldsectors;
+ return -EBUSY;
}
return len;
}
@@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
+ sector_t save_rp = mddev->reshape_position;
+
+ mddev_unlock(mddev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
+ mddev_lock_nointr(mddev);
+ /*
+ * set RECOVERY_INTR again and restore reshape
+ * position in case others changed them after
+ * got lock, eg, reshape_position_store and
+ * md_check_recovery.
+ */
+ mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
@@ -5001,7 +4984,7 @@ static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
unsigned long resync, dt, db;
- if (mddev->curr_resync == 0)
+ if (mddev->curr_resync == MD_RESYNC_NONE)
return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
@@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
- if (mddev->curr_resync == 1 ||
- mddev->curr_resync == 2)
+ if (mddev->curr_resync == MD_RESYNC_YIELDED ||
+ mddev->curr_resync == MD_RESYNC_DELAYED)
return sprintf(page, "delayed\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
spin_lock(&all_mddevs_lock);
- if (list_empty(&mddev->all_mddevs)) {
+ if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
- mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->show(mddev, page);
@@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
spin_lock(&all_mddevs_lock);
- if (list_empty(&mddev->all_mddevs)) {
+ if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
- mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
-static void md_free(struct kobject *ko)
+static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
@@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
- if (mddev->gendisk) {
- del_gendisk(mddev->gendisk);
- put_disk(mddev->gendisk);
- }
- percpu_ref_exit(&mddev->writes_pending);
-
- bioset_exit(&mddev->bio_set);
- bioset_exit(&mddev->sync_set);
- kfree(mddev);
+ del_gendisk(mddev->gendisk);
+ put_disk(mddev->gendisk);
}
static const struct sysfs_ops md_sysfs_ops = {
@@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = {
.store = md_attr_store,
};
static struct kobj_type md_ktype = {
- .release = md_free,
+ .release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
};
@@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
- kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
@@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-static int md_alloc(dev_t dev, char *name)
+struct mddev *md_alloc(dev_t dev, char *name)
{
/*
* If dev is zero, name is the name of a device to allocate with
@@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name)
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
if (IS_ERR(mddev)) {
- mutex_unlock(&disks_mutex);
- return PTR_ERR(mddev);
+ error = PTR_ERR(mddev);
+ goto out_unlock;
}
partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
@@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name)
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
error = -EEXIST;
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
}
spin_unlock(&all_mddevs_lock);
}
@@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name)
error = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name)
mddev->gendisk = disk;
error = add_disk(disk);
if (error)
- goto out_cleanup_disk;
+ goto out_put_disk;
+ kobject_init(&mddev->kobj, &md_ktype);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
- if (error)
- goto out_del_gendisk;
+ if (error) {
+ /*
+ * The disk is already live at this point. Clear the hold flag
+ * and let mddev_put take care of the deletion, as it isn't any
+ * different from a normal close on last release now.
+ */
+ mddev->hold_active = 0;
+ mutex_unlock(&disks_mutex);
+ mddev_put(mddev);
+ return ERR_PTR(error);
+ }
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
- goto out_unlock_disks_mutex;
+ mutex_unlock(&disks_mutex);
+ return mddev;
-out_del_gendisk:
- del_gendisk(disk);
-out_cleanup_disk:
+out_put_disk:
put_disk(disk);
-out_unlock_disks_mutex:
+out_free_mddev:
+ mddev_free(mddev);
+out_unlock:
mutex_unlock(&disks_mutex);
+ return ERR_PTR(error);
+}
+
+static int md_alloc_and_put(dev_t dev, char *name)
+{
+ struct mddev *mddev = md_alloc(dev, name);
+
+ if (IS_ERR(mddev))
+ return PTR_ERR(mddev);
mddev_put(mddev);
- return error;
+ return 0;
}
static void md_probe(dev_t dev)
@@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev)
if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
return;
if (create_on_open)
- md_alloc(dev, NULL);
+ md_alloc_and_put(dev, NULL);
}
static int add_named_array(const char *val, const struct kernel_param *kp)
@@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
return -E2BIG;
strscpy(buf, val, len+1);
if (strncmp(buf, "md_", 3) == 0)
- return md_alloc(0, buf);
+ return md_alloc_and_put(0, buf);
if (strncmp(buf, "md", 2) == 0 &&
isdigit(buf[2]) &&
kstrtoul(buf+2, 10, &devnum) == 0 &&
devnum <= MINORMASK)
- return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
+ return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
return -EINVAL;
}
@@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
@@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- md_bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
if (mddev->event_work.func)
flush_workqueue(md_misc_wq);
+ md_bitmap_destroy(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
spin_unlock(&mddev->lock);
@@ -6497,9 +6491,8 @@ static void autorun_devices(int part)
break;
}
- md_probe(dev);
- mddev = mddev_find(dev);
- if (!mddev)
+ mddev = md_alloc(dev, NULL);
+ if (IS_ERR(mddev))
break;
if (mddev_lock(mddev))
@@ -7782,45 +7775,33 @@ out_unlock:
static int md_open(struct block_device *bdev, fmode_t mode)
{
- /*
- * Succeed if we can lock the mddev, which confirms that
- * it isn't being stopped right now.
- */
- struct mddev *mddev = mddev_find(bdev->bd_dev);
+ struct mddev *mddev;
int err;
+ spin_lock(&all_mddevs_lock);
+ mddev = mddev_get(bdev->bd_disk->private_data);
+ spin_unlock(&all_mddevs_lock);
if (!mddev)
return -ENODEV;
- if (mddev->gendisk != bdev->bd_disk) {
- /* we are racing with mddev_put which is discarding this
- * bd_disk.
- */
- mddev_put(mddev);
- /* Wait until bdev->bd_disk is definitely gone */
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
- return -EBUSY;
- }
- BUG_ON(mddev != bdev->bd_disk->private_data);
-
- if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
+ err = mutex_lock_interruptible(&mddev->open_mutex);
+ if (err)
goto out;
- if (test_bit(MD_CLOSING, &mddev->flags)) {
- mutex_unlock(&mddev->open_mutex);
- err = -ENODEV;
- goto out;
- }
+ err = -ENODEV;
+ if (test_bit(MD_CLOSING, &mddev->flags))
+ goto out_unlock;
- err = 0;
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
bdev_check_media_change(bdev);
- out:
- if (err)
- mddev_put(mddev);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&mddev->open_mutex);
+out:
+ mddev_put(mddev);
return err;
}
@@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
return ret;
}
+static void md_free_disk(struct gendisk *disk)
+{
+ struct mddev *mddev = disk->private_data;
+
+ percpu_ref_exit(&mddev->writes_pending);
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
+
+ mddev_free(mddev);
+}
+
const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
@@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops =
.getgeo = md_getgeo,
.check_events = md_check_events,
.set_read_only = md_set_read_only,
+ .free_disk = md_free_disk,
};
static int md_thread(void *arg)
@@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
- if (resync <= 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
- } else if (resync > max_sectors)
+ } else if (resync > max_sectors) {
resync = max_sectors;
- else
+ } else {
resync -= atomic_read(&mddev->recovery_active);
+ if (resync < MD_RESYNC_ACTIVE) {
+ /*
+ * Resync has started, but the subtraction has
+ * yielded one of the special values. Force it
+ * to active to ensure the status reports an
+ * active resync.
+ */
+ resync = MD_RESYNC_ACTIVE;
+ }
+ }
- if (resync == 0) {
+ if (resync == MD_RESYNC_NONE) {
if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
struct md_rdev *rdev;
@@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
return 0;
}
- if (resync < 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
@@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
if (!l--) {
mddev = list_entry(tmp, struct mddev, all_mddevs);
mddev_get(mddev);
+ if (!mddev_get(mddev))
+ continue;
spin_unlock(&all_mddevs_lock);
return mddev;
}
@@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct list_head *tmp;
struct mddev *next_mddev, *mddev = v;
+ struct mddev *to_put = NULL;
++*pos;
if (v == (void*)2)
return NULL;
spin_lock(&all_mddevs_lock);
- if (v == (void*)1)
+ if (v == (void*)1) {
tmp = all_mddevs.next;
- else
+ } else {
+ to_put = mddev;
+ tmp = mddev->all_mddevs.next;
+ }
+
+ for (;;) {
+ if (tmp == &all_mddevs) {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ break;
+ }
+ next_mddev = list_entry(tmp, struct mddev, all_mddevs);
+ if (mddev_get(next_mddev))
+ break;
+ mddev = next_mddev;
tmp = mddev->all_mddevs.next;
- if (tmp != &all_mddevs)
- next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
- else {
- next_mddev = (void*)2;
- *pos = 0x10000;
}
spin_unlock(&all_mddevs_lock);
- if (v != (void*)1)
+ if (to_put)
mddev_put(mddev);
return next_mddev;
@@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread)
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
- struct list_head *tmp;
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
@@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread)
mddev->last_sync_action = action ?: desc;
- /* we overload curr_resync somewhat here.
- * 0 == not engaged in resync at all
- * 2 == checking that there is no conflict with another sync
- * 1 == like 2, but have yielded to allow conflicting resync to
- * commence
- * other == active in resync - this many blocks
- *
+ /*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
@@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread)
do {
int mddev2_minor = -1;
- mddev->curr_resync = 2;
+ mddev->curr_resync = MD_RESYNC_DELAYED;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
- for_each_mddev(mddev2, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
+ if (test_bit(MD_DELETED, &mddev2->flags))
+ continue;
if (mddev2 == mddev)
continue;
if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
- if (mddev < mddev2 && mddev->curr_resync == 2) {
+ if (mddev < mddev2 &&
+ mddev->curr_resync == MD_RESYNC_DELAYED) {
/* arbitrarily yield */
- mddev->curr_resync = 1;
+ mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
}
- if (mddev > mddev2 && mddev->curr_resync == 1)
+ if (mddev > mddev2 &&
+ mddev->curr_resync == MD_RESYNC_YIELDED)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
@@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev),
mdname(mddev2));
}
- mddev_put(mddev2);
+ spin_unlock(&all_mddevs_lock);
+
if (signal_pending(current))
flush_signals(current);
schedule();
@@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread)
finish_wait(&resync_wait, &wq);
}
}
- } while (mddev->curr_resync < 2);
+ spin_unlock(&all_mddevs_lock);
+ } while (mddev->curr_resync < MD_RESYNC_DELAYED);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev));
mddev->curr_resync = j;
} else
- mddev->curr_resync = 3; /* no longer delayed */
+ mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
@@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread)
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
@@ -9082,7 +9097,7 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
- mddev->curr_resync = 0;
+ mddev->curr_resync = MD_RESYNC_NONE;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
@@ -9303,6 +9318,7 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9338,6 +9354,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
}
if (mddev->sync_thread) {
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
goto unlock;
}
@@ -9417,8 +9434,7 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
- /* resync has finished, collect result */
- md_unregister_thread(&mddev->sync_thread);
+ /* sync_thread should be unregistered, collect result */
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
@@ -9466,6 +9482,7 @@ void md_reap_sync_thread(struct mddev *mddev)
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
if (mddev->event_work.func)
@@ -9544,11 +9561,14 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct list_head *tmp;
- struct mddev *mddev;
+ struct mddev *mddev, *n;
int need_delay = 0;
- for_each_mddev(mddev, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ if (!mddev_get(mddev))
+ continue;
+ spin_unlock(&all_mddevs_lock);
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
@@ -9557,7 +9577,11 @@ static int md_notify_reboot(struct notifier_block *this,
mddev_unlock(mddev);
}
need_delay = 1;
+ mddev_put(mddev);
+ spin_lock(&all_mddevs_lock);
}
+ spin_unlock(&all_mddevs_lock);
+
/*
* certain more exotic SCSI devices are known to be
* volatile wrt too early system reboots. While the
@@ -9876,8 +9900,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
- struct mddev *mddev;
- struct list_head *tmp;
+ struct mddev *mddev, *n;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@@ -9897,17 +9920,24 @@ static __exit void md_exit(void)
}
remove_proc_entry("mdstat", NULL);
- for_each_mddev(mddev, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ if (!mddev_get(mddev))
+ continue;
+ spin_unlock(&all_mddevs_lock);
export_array(mddev);
mddev->ctime = 0;
mddev->hold_active = 0;
/*
- * for_each_mddev() will call mddev_put() at the end of each
- * iteration. As the mddev is now fully clear, this will
- * schedule the mddev for destruction by a workqueue, and the
+ * As the mddev is now fully clear, mddev_put will schedule
+ * the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
+ mddev_put(mddev);
+ spin_lock(&all_mddevs_lock);
}
+ spin_unlock(&all_mddevs_lock);
+
destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4f84b27bdef..b4e2d8b87b61 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -254,6 +254,7 @@ struct md_cluster_info;
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
+ * @MD_DELETED: This device is being deleted
*
* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
*/
@@ -270,6 +271,7 @@ enum mddev_flags {
MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
+ MD_DELETED,
};
enum mddev_sb_flags {
@@ -288,6 +290,21 @@ struct serial_info {
sector_t _subtree_last; /* highest sector in subtree of rb node */
};
+/*
+ * mddev->curr_resync stores the current sector of the resync but
+ * also has some overloaded values.
+ */
+enum {
+ /* No resync in progress */
+ MD_RESYNC_NONE = 0,
+ /* Yielded to allow another conflicting resync to commence */
+ MD_RESYNC_YIELDED = 1,
+ /* Delayed to check that there is no conflict with another sync */
+ MD_RESYNC_DELAYED = 2,
+ /* Any value greater than or equal to this is in an active resync */
+ MD_RESYNC_ACTIVE = 3,
+};
+
struct mddev {
void *private;
struct md_personality *pers;
@@ -750,6 +767,8 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void mddev_init(struct mddev *mddev);
+struct mddev *md_alloc(dev_t dev, char *name);
+void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 26545950ca42..9117fcdee1be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2167,9 +2167,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int err = 0;
int number = rdev->raid_disk;
struct md_rdev **rdevp;
- struct raid10_info *p = conf->mirrors + number;
+ struct raid10_info *p;
print_conf(conf);
+ if (unlikely(number >= mddev->raid_disks))
+ return 0;
+ p = conf->mirrors + number;
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6f2dd73128b0..f4e1cc1ece43 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log;
- bool ret;
- /* don't allow write if journal disk is missing */
- rcu_read_lock();
- log = rcu_dereference(conf->log);
+ struct r5l_log *log = conf->log;
+ /* don't allow write if journal disk is missing */
if (!log)
- ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+ return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
else
- ret = test_bit(Faulty, &log->rdev->flags);
- rcu_read_unlock();
- return ret;
+ return test_bit(Faulty, &log->rdev->flags);
}
#define R5L_RECOVERY_PAGE_POOL_SIZE 256
@@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
- spin_lock(&mddev->lock);
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
conf = mddev->private;
- if (!conf || !conf->log) {
- spin_unlock(&mddev->lock);
- return 0;
- }
+ if (!conf || !conf->log)
+ goto out_unlock;
switch (conf->log->r5c_journal_mode) {
case R5C_JOURNAL_MODE_WRITE_THROUGH:
@@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
- spin_unlock(&mddev->lock);
+
+out_unlock:
+ mddev_unlock(mddev);
return ret;
}
@@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf,
int i;
struct r5dev *dev;
int to_cache = 0;
- void **pslot;
+ void __rcu **pslot;
sector_t tree_index;
int ret;
uintptr_t refcount;
@@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
int i;
int do_wakeup = 0;
sector_t tree_index;
- void **pslot;
+ void __rcu **pslot;
uintptr_t refcount;
if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
@@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- rcu_assign_pointer(conf->log, log);
+ conf->log = log;
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
-
/* Ensure disable_writeback_work wakes up and exits */
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
+
+ conf->log = NULL;
+
mempool_exit(&log->meta_pool);
bioset_exit(&log->bs);
mempool_exit(&log->io_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 43c714a8798c..c8332502669e 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -2,49 +2,46 @@
#ifndef _RAID5_LOG_H
#define _RAID5_LOG_H
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5conf *conf);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int quiesce);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+void r5l_exit_log(struct r5conf *conf);
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+void r5l_write_stripe_run(struct r5l_log *log);
+void r5l_flush_stripe_to_raid(struct r5l_log *log);
+void r5l_stripe_write_finished(struct stripe_head *sh);
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+void r5l_quiesce(struct r5l_log *log, int quiesce);
+bool r5l_log_disk_error(struct r5conf *conf);
+bool r5c_is_writeback(struct r5l_log *log);
+int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+void r5c_release_extra_page(struct stripe_head *sh);
+void r5c_use_extra_page(struct stripe_head *sh);
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks);
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+void r5c_make_stripe_write_out(struct stripe_head *sh);
+void r5c_flush_cache(struct r5conf *conf, int num);
+void r5c_check_stripe_cache_usage(struct r5conf *conf);
+void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev,
- struct md_rdev *rdev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
-extern int r5l_start(struct r5l_log *log);
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev);
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+int r5l_start(struct r5l_log *log);
-extern struct dma_async_tx_descriptor *
+struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx);
-extern int ppl_init_log(struct r5conf *conf);
-extern void ppl_exit_log(struct r5conf *conf);
-extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
-extern void ppl_write_stripe_run(struct r5conf *conf);
-extern void ppl_stripe_write_finished(struct stripe_head *sh);
-extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
-extern void ppl_quiesce(struct r5conf *conf, int quiesce);
-extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+int ppl_init_log(struct r5conf *conf);
+void ppl_exit_log(struct r5conf *conf);
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+void ppl_write_stripe_run(struct r5conf *conf);
+void ppl_stripe_write_finished(struct stripe_head *sh);
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+void ppl_quiesce(struct r5conf *conf, int quiesce);
+int ppl_handle_flush_request(struct bio *bio);
extern struct md_sysfs_entry ppl_write_hint;
static inline bool raid5_has_log(struct r5conf *conf)
@@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
if (conf->log)
ret = r5l_handle_flush_request(conf->log, bio);
else if (raid5_has_ppl(conf))
- ret = ppl_handle_flush_request(conf->log, bio);
+ ret = ppl_handle_flush_request(bio);
return ret;
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 98988cb26295..31b9157bc9ae 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
}
}
-int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+int ppl_handle_flush_request(struct bio *bio)
{
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5cabdbbac48b..860c45c10a57 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,8 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
+#define RAID5_MAX_REQ_STRIPES 256
+
static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
@@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
return NULL;
}
+static struct stripe_head *find_get_stripe(struct r5conf *conf,
+ sector_t sector, short generation, int hash)
+{
+ int inc_empty_inactive_list_flag;
+ struct stripe_head *sh;
+
+ sh = __find_stripe(conf, sector, generation);
+ if (!sh)
+ return NULL;
+
+ if (atomic_inc_not_zero(&sh->count))
+ return sh;
+
+ /*
+ * Slow path. The reference count is zero which means the stripe must
+ * be on a list (sh->lru). Must remove the stripe from the list that
+ * references it with the device_lock held.
+ */
+
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&sh->count)) {
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state));
+ inc_empty_inactive_list_flag = 0;
+ if (!list_empty(conf->inactive_list + hash))
+ inc_empty_inactive_list_flag = 1;
+ list_del_init(&sh->lru);
+ if (list_empty(conf->inactive_list + hash) &&
+ inc_empty_inactive_list_flag)
+ atomic_inc(&conf->empty_inactive_list_nr);
+ if (sh->group) {
+ sh->group->stripes_cnt--;
+ sh->group = NULL;
+ }
+ }
+ atomic_inc(&sh->count);
+ spin_unlock(&conf->device_lock);
+
+ return sh;
+}
+
/*
* Need to check if array has failed when deciding whether to:
* - start an array
@@ -710,80 +755,121 @@ static bool has_failed(struct r5conf *conf)
return degraded > conf->max_degraded;
}
-struct stripe_head *
-raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
- int previous, int noblock, int noquiesce)
+enum stripe_result {
+ STRIPE_SUCCESS = 0,
+ STRIPE_RETRY,
+ STRIPE_SCHEDULE_AND_RETRY,
+ STRIPE_FAIL,
+};
+
+struct stripe_request_ctx {
+ /* a reference to the last stripe_head for batching */
+ struct stripe_head *batch_last;
+
+ /* first sector in the request */
+ sector_t first_sector;
+
+ /* last sector in the request */
+ sector_t last_sector;
+
+ /*
+ * bitmap to track stripe sectors that have been added to stripes
+ * add one to account for unaligned requests
+ */
+ DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
+
+ /* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+ bool do_flush;
+};
+
+/*
+ * Block until another thread clears R5_INACTIVE_BLOCKED or
+ * there are fewer than 3/4 the maximum number of active stripes
+ * and there is an inactive stripe available.
+ */
+static bool is_inactive_blocked(struct r5conf *conf, int hash)
+{
+ int active = atomic_read(&conf->active_stripes);
+
+ if (list_empty(conf->inactive_list + hash))
+ return false;
+
+ if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+ return true;
+
+ return active < (conf->max_nr_stripes * 3 / 4);
+}
+
+static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
+ struct stripe_request_ctx *ctx, sector_t sector,
+ bool previous, bool noblock, bool noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(conf, sector);
- int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
spin_lock_irq(conf->hash_locks + hash);
- do {
- wait_event_lock_irq(conf->wait_for_quiescent,
- conf->quiesce == 0 || noquiesce,
+retry:
+ if (!noquiesce && conf->quiesce) {
+ /*
+ * Must release the reference to batch_last before waiting,
+ * on quiesce, otherwise the batch_last will hold a reference
+ * to a stripe and raid5_quiesce() will deadlock waiting for
+ * active_stripes to go to zero.
+ */
+ if (ctx && ctx->batch_last) {
+ raid5_release_stripe(ctx->batch_last);
+ ctx->batch_last = NULL;
+ }
+
+ wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
*(conf->hash_locks + hash));
- sh = __find_stripe(conf, sector, conf->generation - previous);
- if (!sh) {
- if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
- sh = get_free_stripe(conf, hash);
- if (!sh && !test_bit(R5_DID_ALLOC,
- &conf->cache_state))
- set_bit(R5_ALLOC_MORE,
- &conf->cache_state);
- }
- if (noblock && sh == NULL)
- break;
+ }
- r5c_check_stripe_cache_usage(conf);
- if (!sh) {
- set_bit(R5_INACTIVE_BLOCKED,
- &conf->cache_state);
- r5l_wake_reclaim(conf->log, 0);
- wait_event_lock_irq(
- conf->wait_for_stripe,
- !list_empty(conf->inactive_list + hash) &&
- (atomic_read(&conf->active_stripes)
- < (conf->max_nr_stripes * 3 / 4)
- || !test_bit(R5_INACTIVE_BLOCKED,
- &conf->cache_state)),
- *(conf->hash_locks + hash));
- clear_bit(R5_INACTIVE_BLOCKED,
- &conf->cache_state);
- } else {
- init_stripe(sh, sector, previous);
- atomic_inc(&sh->count);
- }
- } else if (!atomic_inc_not_zero(&sh->count)) {
- spin_lock(&conf->device_lock);
- if (!atomic_read(&sh->count)) {
- if (!test_bit(STRIPE_HANDLE, &sh->state))
- atomic_inc(&conf->active_stripes);
- BUG_ON(list_empty(&sh->lru) &&
- !test_bit(STRIPE_EXPANDING, &sh->state));
- inc_empty_inactive_list_flag = 0;
- if (!list_empty(conf->inactive_list + hash))
- inc_empty_inactive_list_flag = 1;
- list_del_init(&sh->lru);
- if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
- atomic_inc(&conf->empty_inactive_list_nr);
- if (sh->group) {
- sh->group->stripes_cnt--;
- sh->group = NULL;
- }
- }
- atomic_inc(&sh->count);
- spin_unlock(&conf->device_lock);
- }
- } while (sh == NULL);
+ sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
+ if (sh)
+ goto out;
+
+ if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+ goto wait_for_stripe;
+
+ sh = get_free_stripe(conf, hash);
+ if (sh) {
+ r5c_check_stripe_cache_usage(conf);
+ init_stripe(sh, sector, previous);
+ atomic_inc(&sh->count);
+ goto out;
+ }
+
+ if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
+ set_bit(R5_ALLOC_MORE, &conf->cache_state);
+
+wait_for_stripe:
+ if (noblock)
+ goto out;
+ set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+ r5l_wake_reclaim(conf->log, 0);
+ wait_event_lock_irq(conf->wait_for_stripe,
+ is_inactive_blocked(conf, hash),
+ *(conf->hash_locks + hash));
+ clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+ goto retry;
+
+out:
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
+ sector_t sector, bool previous, bool noblock, bool noquiesce)
+{
+ return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
+ noquiesce);
+}
+
static bool is_full_stripe_write(struct stripe_head *sh)
{
BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
@@ -824,13 +910,13 @@ static bool stripe_can_batch(struct stripe_head *sh)
}
/* we only do back search */
-static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+static void stripe_add_to_batch_list(struct r5conf *conf,
+ struct stripe_head *sh, struct stripe_head *last_sh)
{
struct stripe_head *head;
sector_t head_sector, tmp_sec;
int hash;
int dd_idx;
- int inc_empty_inactive_list_flag;
/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
tmp_sec = sh->sector;
@@ -838,36 +924,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
return;
head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
- hash = stripe_hash_locks_hash(conf, head_sector);
- spin_lock_irq(conf->hash_locks + hash);
- head = __find_stripe(conf, head_sector, conf->generation);
- if (head && !atomic_inc_not_zero(&head->count)) {
- spin_lock(&conf->device_lock);
- if (!atomic_read(&head->count)) {
- if (!test_bit(STRIPE_HANDLE, &head->state))
- atomic_inc(&conf->active_stripes);
- BUG_ON(list_empty(&head->lru) &&
- !test_bit(STRIPE_EXPANDING, &head->state));
- inc_empty_inactive_list_flag = 0;
- if (!list_empty(conf->inactive_list + hash))
- inc_empty_inactive_list_flag = 1;
- list_del_init(&head->lru);
- if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
- atomic_inc(&conf->empty_inactive_list_nr);
- if (head->group) {
- head->group->stripes_cnt--;
- head->group = NULL;
- }
- }
+ if (last_sh && head_sector == last_sh->sector) {
+ head = last_sh;
atomic_inc(&head->count);
- spin_unlock(&conf->device_lock);
+ } else {
+ hash = stripe_hash_locks_hash(conf, head_sector);
+ spin_lock_irq(conf->hash_locks + hash);
+ head = find_get_stripe(conf, head_sector, conf->generation,
+ hash);
+ spin_unlock_irq(conf->hash_locks + hash);
+ if (!head)
+ return;
+ if (!stripe_can_batch(head))
+ goto out;
}
- spin_unlock_irq(conf->hash_locks + hash);
-
- if (!head)
- return;
- if (!stripe_can_batch(head))
- goto out;
lock_two_stripes(head, sh);
/* clear_batch_ready clear the flag */
@@ -2882,10 +2952,10 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
raid5_release_stripe(sh->batch_head);
+ raid5_release_stripe(sh);
}
static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
@@ -3413,39 +3483,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked, s->ops_request);
}
-/*
- * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
- int forwrite, int previous)
+static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite)
{
- struct bio **bip;
struct r5conf *conf = sh->raid_conf;
- int firstwrite=0;
+ struct bio **bip;
- pr_debug("adding bi b#%llu to stripe s#%llu\n",
- (unsigned long long)bi->bi_iter.bi_sector,
- (unsigned long long)sh->sector);
+ pr_debug("checking bi b#%llu to stripe s#%llu\n",
+ bi->bi_iter.bi_sector, sh->sector);
- spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
- goto overlap;
- if (forwrite) {
+ return true;
+
+ if (forwrite)
bip = &sh->dev[dd_idx].towrite;
- if (*bip == NULL)
- firstwrite = 1;
- } else
+ else
bip = &sh->dev[dd_idx].toread;
+
while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
- goto overlap;
- bip = & (*bip)->bi_next;
+ return true;
+ bip = &(*bip)->bi_next;
}
+
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
- goto overlap;
+ return true;
if (forwrite && raid5_has_ppl(conf)) {
/*
@@ -3474,9 +3537,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
}
if (first + conf->chunk_sectors * (count - 1) != last)
- goto overlap;
+ return true;
+ }
+
+ return false;
+}
+
+static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite, int previous)
+{
+ struct r5conf *conf = sh->raid_conf;
+ struct bio **bip;
+ int firstwrite = 0;
+
+ if (forwrite) {
+ bip = &sh->dev[dd_idx].towrite;
+ if (!*bip)
+ firstwrite = 1;
+ } else {
+ bip = &sh->dev[dd_idx].toread;
}
+ while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
+ bip = &(*bip)->bi_next;
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3502,9 +3586,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
sh->overwrite_disks++;
}
- pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)(*bip)->bi_iter.bi_sector,
- (unsigned long long)sh->sector, dd_idx);
+ pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
+ (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
+ sh->dev[dd_idx].sector);
if (conf->mddev->bitmap && firstwrite) {
/* Cannot hold spinlock over bitmap_startwrite,
@@ -3512,7 +3596,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
* we have added to the bitmap and set bm_seq.
* So set STRIPE_BITMAP_PENDING to prevent
* batching.
- * If multiple add_stripe_bio() calls race here they
+ * If multiple __add_stripe_bio() calls race here they
* much all set STRIPE_BITMAP_PENDING. So only the first one
* to complete "bitmap_startwrite" gets to set
* STRIPE_BIT_DELAY. This is important as once a stripe
@@ -3530,16 +3614,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
- spin_unlock_irq(&sh->stripe_lock);
+}
- if (stripe_can_batch(sh))
- stripe_add_to_batch_list(conf, sh);
- return 1;
+/*
+ * Each stripe/dev can have one or more bios attached.
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite, int previous)
+{
+ spin_lock_irq(&sh->stripe_lock);
+
+ if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ return false;
+ }
- overlap:
- set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
spin_unlock_irq(&sh->stripe_lock);
- return 0;
+ return true;
}
static void end_reshape(struct r5conf *conf);
@@ -5785,17 +5880,215 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
bio_endio(bi);
}
-static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
+ sector_t reshape_sector)
{
- struct r5conf *conf = mddev->private;
+ return mddev->reshape_backwards ? sector < reshape_sector :
+ sector >= reshape_sector;
+}
+
+static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
+ sector_t max, sector_t reshape_sector)
+{
+ return mddev->reshape_backwards ? max < reshape_sector :
+ min >= reshape_sector;
+}
+
+static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
+ struct stripe_head *sh)
+{
+ sector_t max_sector = 0, min_sector = MaxSector;
+ bool ret = false;
int dd_idx;
- sector_t new_sector;
- sector_t logical_sector, last_sector;
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ if (dd_idx == sh->pd_idx)
+ continue;
+
+ min_sector = min(min_sector, sh->dev[dd_idx].sector);
+ max_sector = min(max_sector, sh->dev[dd_idx].sector);
+ }
+
+ spin_lock_irq(&conf->device_lock);
+
+ if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
+ conf->reshape_progress))
+ /* mismatch, need to try again */
+ ret = true;
+
+ spin_unlock_irq(&conf->device_lock);
+
+ return ret;
+}
+
+static int add_all_stripe_bios(struct r5conf *conf,
+ struct stripe_request_ctx *ctx, struct stripe_head *sh,
+ struct bio *bi, int forwrite, int previous)
+{
+ int dd_idx;
+ int ret = 1;
+
+ spin_lock_irq(&sh->stripe_lock);
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ struct r5dev *dev = &sh->dev[dd_idx];
+
+ if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ continue;
+
+ if (dev->sector < ctx->first_sector ||
+ dev->sector >= ctx->last_sector)
+ continue;
+
+ if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+ set_bit(R5_Overlap, &dev->flags);
+ ret = 0;
+ continue;
+ }
+ }
+
+ if (!ret)
+ goto out;
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ struct r5dev *dev = &sh->dev[dd_idx];
+
+ if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ continue;
+
+ if (dev->sector < ctx->first_sector ||
+ dev->sector >= ctx->last_sector)
+ continue;
+
+ __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
+ clear_bit((dev->sector - ctx->first_sector) >>
+ RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
+ }
+
+out:
+ spin_unlock_irq(&sh->stripe_lock);
+ return ret;
+}
+
+static enum stripe_result make_stripe_request(struct mddev *mddev,
+ struct r5conf *conf, struct stripe_request_ctx *ctx,
+ sector_t logical_sector, struct bio *bi)
+{
+ const int rw = bio_data_dir(bi);
+ enum stripe_result ret;
struct stripe_head *sh;
+ sector_t new_sector;
+ int previous = 0;
+ int seq, dd_idx;
+
+ seq = read_seqcount_begin(&conf->gen_lock);
+
+ if (unlikely(conf->reshape_progress != MaxSector)) {
+ /*
+ * Spinlock is needed as reshape_progress may be
+ * 64bit on a 32bit platform, and so it might be
+ * possible to see a half-updated value
+ * Of course reshape_progress could change after
+ * the lock is dropped, so once we get a reference
+ * to the stripe that we think it is, we will have
+ * to check again.
+ */
+ spin_lock_irq(&conf->device_lock);
+ if (ahead_of_reshape(mddev, logical_sector,
+ conf->reshape_progress)) {
+ previous = 1;
+ } else {
+ if (ahead_of_reshape(mddev, logical_sector,
+ conf->reshape_safe)) {
+ spin_unlock_irq(&conf->device_lock);
+ return STRIPE_SCHEDULE_AND_RETRY;
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+
+ new_sector = raid5_compute_sector(conf, logical_sector, previous,
+ &dd_idx, NULL);
+ pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
+ new_sector, logical_sector);
+
+ sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
+ (bi->bi_opf & REQ_RAHEAD), 0);
+ if (unlikely(!sh)) {
+ /* cannot get stripe, just give-up */
+ bi->bi_status = BLK_STS_IOERR;
+ return STRIPE_FAIL;
+ }
+
+ if (unlikely(previous) &&
+ stripe_ahead_of_reshape(mddev, conf, sh)) {
+ /*
+ * Expansion moved on while waiting for a stripe.
+ * Expansion could still move past after this
+ * test, but as we are holding a reference to
+ * 'sh', we know that if that happens,
+ * STRIPE_EXPANDING will get set and the expansion
+ * won't proceed until we finish with the stripe.
+ */
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out_release;
+ }
+
+ if (read_seqcount_retry(&conf->gen_lock, seq)) {
+ /* Might have got the wrong stripe_head by accident */
+ ret = STRIPE_RETRY;
+ goto out_release;
+ }
+
+ if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+ !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+ /*
+ * Stripe is busy expanding or add failed due to
+ * overlap. Flush everything and wait a while.
+ */
+ md_wakeup_thread(mddev->thread);
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out_release;
+ }
+
+ if (stripe_can_batch(sh)) {
+ stripe_add_to_batch_list(conf, sh, ctx->batch_last);
+ if (ctx->batch_last)
+ raid5_release_stripe(ctx->batch_last);
+ atomic_inc(&sh->count);
+ ctx->batch_last = sh;
+ }
+
+ if (ctx->do_flush) {
+ set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+ /* we only need flush for one stripe */
+ ctx->do_flush = false;
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if ((!sh->batch_head || sh == sh->batch_head) &&
+ (bi->bi_opf & REQ_SYNC) &&
+ !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+
+ release_stripe_plug(mddev, sh);
+ return STRIPE_SUCCESS;
+
+out_release:
+ raid5_release_stripe(sh);
+ return ret;
+}
+
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector;
+ struct stripe_request_ctx ctx = {};
const int rw = bio_data_dir(bi);
- DEFINE_WAIT(w);
- bool do_prepare;
- bool do_flush = false;
+ enum stripe_result res;
+ int s, stripe_cnt;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = log_handle_flush_request(conf, bi);
@@ -5811,7 +6104,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
* we need to flush journal device
*/
- do_flush = bi->bi_opf & REQ_PREFLUSH;
+ ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
if (!md_write_start(mddev, bi))
@@ -5835,134 +6128,68 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
- last_sector = bio_end_sector(bi);
+ ctx.first_sector = logical_sector;
+ ctx.last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
+ stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+ RAID5_STRIPE_SECTORS(conf));
+ bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
+
+ pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
+ bi->bi_iter.bi_sector, ctx.last_sector);
+
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
(conf->reshape_progress != MaxSector) &&
- (mddev->reshape_backwards
- ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
- : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+ !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
+ ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
bio_wouldblock_error(bi);
if (rw == WRITE)
md_write_end(mddev);
return true;
}
md_account_bio(mddev, &bi);
- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
- int previous;
- int seq;
-
- do_prepare = false;
- retry:
- seq = read_seqcount_begin(&conf->gen_lock);
- previous = 0;
- if (do_prepare)
- prepare_to_wait(&conf->wait_for_overlap, &w,
- TASK_UNINTERRUPTIBLE);
- if (unlikely(conf->reshape_progress != MaxSector)) {
- /* spinlock is needed as reshape_progress may be
- * 64bit on a 32bit platform, and so it might be
- * possible to see a half-updated value
- * Of course reshape_progress could change after
- * the lock is dropped, so once we get a reference
- * to the stripe that we think it is, we will have
- * to check again.
- */
- spin_lock_irq(&conf->device_lock);
- if (mddev->reshape_backwards
- ? logical_sector < conf->reshape_progress
- : logical_sector >= conf->reshape_progress) {
- previous = 1;
- } else {
- if (mddev->reshape_backwards
- ? logical_sector < conf->reshape_safe
- : logical_sector >= conf->reshape_safe) {
- spin_unlock_irq(&conf->device_lock);
- schedule();
- do_prepare = true;
- goto retry;
- }
- }
- spin_unlock_irq(&conf->device_lock);
- }
- new_sector = raid5_compute_sector(conf, logical_sector,
- previous,
- &dd_idx, NULL);
- pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
- (unsigned long long)new_sector,
- (unsigned long long)logical_sector);
+ add_wait_queue(&conf->wait_for_overlap, &wait);
+ while (1) {
+ res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+ bi);
+ if (res == STRIPE_FAIL)
+ break;
- sh = raid5_get_active_stripe(conf, new_sector, previous,
- (bi->bi_opf & REQ_RAHEAD), 0);
- if (sh) {
- if (unlikely(previous)) {
- /* expansion might have moved on while waiting for a
- * stripe, so we must do the range check again.
- * Expansion could still move past after this
- * test, but as we are holding a reference to
- * 'sh', we know that if that happens,
- * STRIPE_EXPANDING will get set and the expansion
- * won't proceed until we finish with the stripe.
- */
- int must_retry = 0;
- spin_lock_irq(&conf->device_lock);
- if (mddev->reshape_backwards
- ? logical_sector >= conf->reshape_progress
- : logical_sector < conf->reshape_progress)
- /* mismatch, need to try again */
- must_retry = 1;
- spin_unlock_irq(&conf->device_lock);
- if (must_retry) {
- raid5_release_stripe(sh);
- schedule();
- do_prepare = true;
- goto retry;
- }
- }
- if (read_seqcount_retry(&conf->gen_lock, seq)) {
- /* Might have got the wrong stripe_head
- * by accident
- */
- raid5_release_stripe(sh);
- goto retry;
- }
+ if (res == STRIPE_RETRY)
+ continue;
- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
- /* Stripe is busy expanding or
- * add failed due to overlap. Flush everything
- * and wait a while
- */
- md_wakeup_thread(mddev->thread);
- raid5_release_stripe(sh);
- schedule();
- do_prepare = true;
- goto retry;
- }
- if (do_flush) {
- set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
- /* we only need flush for one stripe */
- do_flush = false;
+ if (res == STRIPE_SCHEDULE_AND_RETRY) {
+ /*
+ * Must release the reference to batch_last before
+ * scheduling and waiting for work to be done,
+ * otherwise the batch_last stripe head could prevent
+ * raid5_activate_delayed() from making progress
+ * and thus deadlocking.
+ */
+ if (ctx.batch_last) {
+ raid5_release_stripe(ctx.batch_last);
+ ctx.batch_last = NULL;
}
- set_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
- if ((!sh->batch_head || sh == sh->batch_head) &&
- (bi->bi_opf & REQ_SYNC) &&
- !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- atomic_inc(&conf->preread_active_stripes);
- release_stripe_plug(mddev, sh);
- } else {
- /* cannot get stripe for read-ahead, just give-up */
- bi->bi_status = BLK_STS_IOERR;
- break;
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE,
+ MAX_SCHEDULE_TIMEOUT);
+ continue;
}
+
+ s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
+ if (s == stripe_cnt)
+ break;
+
+ logical_sector = ctx.first_sector +
+ (s << RAID5_STRIPE_SHIFT(conf));
}
- finish_wait(&conf->wait_for_overlap, &w);
+ remove_wait_queue(&conf->wait_for_overlap, &wait);
+
+ if (ctx.batch_last)
+ raid5_release_stripe(ctx.batch_last);
if (rw == WRITE)
md_write_end(mddev);
@@ -7815,7 +8042,15 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
- blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
+ /*
+ * Requests require having a bitmap for each stripe.
+ * Limit the max sectors based on this.
+ */
+ blk_queue_max_hw_sectors(mddev->queue,
+ RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
+
+ /* No restrictions on the number of segments in the request */
+ blk_queue_max_segments(mddev->queue, USHRT_MAX);
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8066,8 +8301,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
- if (rdev->saved_raid_disk >= 0 &&
- rdev->saved_raid_disk >= first &&
+ if (rdev->saved_raid_disk >= first &&
rdev->saved_raid_disk <= last &&
conf->disks[rdev->saved_raid_disk].rdev == NULL)
first = rdev->saved_raid_disk;
@@ -8704,8 +8938,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err)
+ if (err) {
+ mddev_suspend(mddev);
log_exit(conf);
+ mddev_resume(mddev);
+ }
}
} else
err = -EINVAL;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 638d29863503..a5082bed83c8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -812,7 +812,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
- int previous, int noblock, int noquiesce);
+ bool previous, bool noblock, bool noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
#endif
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index 87ae409a32b9..656e46d938da 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
menu "NVME Support"
+source "drivers/nvme/common/Kconfig"
source "drivers/nvme/host/Kconfig"
source "drivers/nvme/target/Kconfig"
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index fb42c44609a8..eedca8c72098 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NVME_COMMON) += common/
obj-y += host/
obj-y += target/
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
new file mode 100644
index 000000000000..4514f44362dd
--- /dev/null
+++ b/drivers/nvme/common/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NVME_COMMON
+ tristate
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
new file mode 100644
index 000000000000..720c625b8a52
--- /dev/null
+++ b/drivers/nvme/common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I$(src)
+
+obj-$(CONFIG_NVME_COMMON) += nvme-common.o
+
+nvme-common-y += auth.o
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
new file mode 100644
index 000000000000..04bd28f17dcc
--- /dev/null
+++ b/drivers/nvme/common/auth.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include <linux/nvme.h>
+#include <linux/nvme-auth.h>
+
+static u32 nvme_dhchap_seqnum;
+static DEFINE_MUTEX(nvme_dhchap_mutex);
+
+u32 nvme_auth_get_seqnum(void)
+{
+ u32 seqnum;
+
+ mutex_lock(&nvme_dhchap_mutex);
+ if (!nvme_dhchap_seqnum)
+ nvme_dhchap_seqnum = prandom_u32();
+ else {
+ nvme_dhchap_seqnum++;
+ if (!nvme_dhchap_seqnum)
+ nvme_dhchap_seqnum++;
+ }
+ seqnum = nvme_dhchap_seqnum;
+ mutex_unlock(&nvme_dhchap_mutex);
+ return seqnum;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
+
+static struct nvme_auth_dhgroup_map {
+ const char name[16];
+ const char kpp[16];
+} dhgroup_map[] = {
+ [NVME_AUTH_DHGROUP_NULL] = {
+ .name = "null", .kpp = "null" },
+ [NVME_AUTH_DHGROUP_2048] = {
+ .name = "ffdhe2048", .kpp = "ffdhe2048(dh)" },
+ [NVME_AUTH_DHGROUP_3072] = {
+ .name = "ffdhe3072", .kpp = "ffdhe3072(dh)" },
+ [NVME_AUTH_DHGROUP_4096] = {
+ .name = "ffdhe4096", .kpp = "ffdhe4096(dh)" },
+ [NVME_AUTH_DHGROUP_6144] = {
+ .name = "ffdhe6144", .kpp = "ffdhe6144(dh)" },
+ [NVME_AUTH_DHGROUP_8192] = {
+ .name = "ffdhe8192", .kpp = "ffdhe8192(dh)" },
+};
+
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id)
+{
+ if (dhgroup_id >= ARRAY_SIZE(dhgroup_map))
+ return NULL;
+ return dhgroup_map[dhgroup_id].name;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name);
+
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id)
+{
+ if (dhgroup_id >= ARRAY_SIZE(dhgroup_map))
+ return NULL;
+ return dhgroup_map[dhgroup_id].kpp;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp);
+
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
+{
+ int i;
+
+ if (!dhgroup_name || !strlen(dhgroup_name))
+ return NVME_AUTH_DHGROUP_INVALID;
+ for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) {
+ if (!strlen(dhgroup_map[i].name))
+ continue;
+ if (!strncmp(dhgroup_map[i].name, dhgroup_name,
+ strlen(dhgroup_map[i].name)))
+ return i;
+ }
+ return NVME_AUTH_DHGROUP_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
+
+static struct nvme_dhchap_hash_map {
+ int len;
+ const char hmac[15];
+ const char digest[8];
+} hash_map[] = {
+ [NVME_AUTH_HASH_SHA256] = {
+ .len = 32,
+ .hmac = "hmac(sha256)",
+ .digest = "sha256",
+ },
+ [NVME_AUTH_HASH_SHA384] = {
+ .len = 48,
+ .hmac = "hmac(sha384)",
+ .digest = "sha384",
+ },
+ [NVME_AUTH_HASH_SHA512] = {
+ .len = 64,
+ .hmac = "hmac(sha512)",
+ .digest = "sha512",
+ },
+};
+
+const char *nvme_auth_hmac_name(u8 hmac_id)
+{
+ if (hmac_id >= ARRAY_SIZE(hash_map))
+ return NULL;
+ return hash_map[hmac_id].hmac;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
+
+const char *nvme_auth_digest_name(u8 hmac_id)
+{
+ if (hmac_id >= ARRAY_SIZE(hash_map))
+ return NULL;
+ return hash_map[hmac_id].digest;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
+
+u8 nvme_auth_hmac_id(const char *hmac_name)
+{
+ int i;
+
+ if (!hmac_name || !strlen(hmac_name))
+ return NVME_AUTH_HASH_INVALID;
+
+ for (i = 0; i < ARRAY_SIZE(hash_map); i++) {
+ if (!strlen(hash_map[i].hmac))
+ continue;
+ if (!strncmp(hash_map[i].hmac, hmac_name,
+ strlen(hash_map[i].hmac)))
+ return i;
+ }
+ return NVME_AUTH_HASH_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_id);
+
+size_t nvme_auth_hmac_hash_len(u8 hmac_id)
+{
+ if (hmac_id >= ARRAY_SIZE(hash_map))
+ return 0;
+ return hash_map[hmac_id].len;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+ u8 key_hash)
+{
+ struct nvme_dhchap_key *key;
+ unsigned char *p;
+ u32 crc;
+ int ret, key_len;
+ size_t allocated_len = strlen(secret);
+
+ /* Secret might be affixed with a ':' */
+ p = strrchr(secret, ':');
+ if (p)
+ allocated_len = p - secret;
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return ERR_PTR(-ENOMEM);
+ key->key = kzalloc(allocated_len, GFP_KERNEL);
+ if (!key->key) {
+ ret = -ENOMEM;
+ goto out_free_key;
+ }
+
+ key_len = base64_decode(secret, allocated_len, key->key);
+ if (key_len < 0) {
+ pr_debug("base64 key decoding error %d\n",
+ key_len);
+ ret = key_len;
+ goto out_free_secret;
+ }
+
+ if (key_len != 36 && key_len != 52 &&
+ key_len != 68) {
+ pr_err("Invalid key len %d\n", key_len);
+ ret = -EINVAL;
+ goto out_free_secret;
+ }
+
+ if (key_hash > 0 &&
+ (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
+ pr_err("Mismatched key len %d for %s\n", key_len,
+ nvme_auth_hmac_name(key_hash));
+ ret = -EINVAL;
+ goto out_free_secret;
+ }
+
+ /* The last four bytes is the CRC in little-endian format */
+ key_len -= 4;
+ /*
+ * The linux implementation doesn't do pre- and post-increments,
+ * so we have to do it manually.
+ */
+ crc = ~crc32(~0, key->key, key_len);
+
+ if (get_unaligned_le32(key->key + key_len) != crc) {
+ pr_err("key crc mismatch (key %08x, crc %08x)\n",
+ get_unaligned_le32(key->key + key_len), crc);
+ ret = -EKEYREJECTED;
+ goto out_free_secret;
+ }
+ key->len = key_len;
+ key->hash = key_hash;
+ return key;
+out_free_secret:
+ kfree_sensitive(key->key);
+out_free_key:
+ kfree(key);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
+
+void nvme_auth_free_key(struct nvme_dhchap_key *key)
+{
+ if (!key)
+ return;
+ kfree_sensitive(key->key);
+ kfree(key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free_key);
+
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+{
+ const char *hmac_name;
+ struct crypto_shash *key_tfm;
+ struct shash_desc *shash;
+ u8 *transformed_key;
+ int ret;
+
+ if (!key || !key->key) {
+ pr_warn("No key specified\n");
+ return ERR_PTR(-ENOKEY);
+ }
+ if (key->hash == 0) {
+ transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
+ return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+ }
+ hmac_name = nvme_auth_hmac_name(key->hash);
+ if (!hmac_name) {
+ pr_warn("Invalid key hash id %d\n", key->hash);
+ return ERR_PTR(-EINVAL);
+ }
+
+ key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(key_tfm))
+ return (u8 *)key_tfm;
+
+ shash = kmalloc(sizeof(struct shash_desc) +
+ crypto_shash_descsize(key_tfm),
+ GFP_KERNEL);
+ if (!shash) {
+ ret = -ENOMEM;
+ goto out_free_key;
+ }
+
+ transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+ if (!transformed_key) {
+ ret = -ENOMEM;
+ goto out_free_shash;
+ }
+
+ shash->tfm = key_tfm;
+ ret = crypto_shash_setkey(key_tfm, key->key, key->len);
+ if (ret < 0)
+ goto out_free_transformed_key;
+ ret = crypto_shash_init(shash);
+ if (ret < 0)
+ goto out_free_transformed_key;
+ ret = crypto_shash_update(shash, nqn, strlen(nqn));
+ if (ret < 0)
+ goto out_free_transformed_key;
+ ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
+ if (ret < 0)
+ goto out_free_transformed_key;
+ ret = crypto_shash_final(shash, transformed_key);
+ if (ret < 0)
+ goto out_free_transformed_key;
+
+ kfree(shash);
+ crypto_free_shash(key_tfm);
+
+ return transformed_key;
+
+out_free_transformed_key:
+ kfree_sensitive(transformed_key);
+out_free_shash:
+ kfree(shash);
+out_free_key:
+ crypto_free_shash(key_tfm);
+
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
+
+static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
+{
+ const char *digest_name;
+ struct crypto_shash *tfm;
+ int ret;
+
+ digest_name = nvme_auth_digest_name(hmac_id);
+ if (!digest_name) {
+ pr_debug("%s: failed to get digest for %d\n", __func__,
+ hmac_id);
+ return -EINVAL;
+ }
+ tfm = crypto_alloc_shash(digest_name, 0, 0);
+ if (IS_ERR(tfm))
+ return -ENOMEM;
+
+ ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
+ if (ret < 0)
+ pr_debug("%s: Failed to hash digest len %zu\n", __func__,
+ skey_len);
+
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+ u8 *challenge, u8 *aug, size_t hlen)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ u8 *hashed_key;
+ const char *hmac_name;
+ int ret;
+
+ hashed_key = kmalloc(hlen, GFP_KERNEL);
+ if (!hashed_key)
+ return -ENOMEM;
+
+ ret = nvme_auth_hash_skey(hmac_id, skey,
+ skey_len, hashed_key);
+ if (ret < 0)
+ goto out_free_key;
+
+ hmac_name = nvme_auth_hmac_name(hmac_id);
+ if (!hmac_name) {
+ pr_warn("%s: invalid hash algorithm %d\n",
+ __func__, hmac_id);
+ ret = -EINVAL;
+ goto out_free_key;
+ }
+
+ tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out_free_key;
+ }
+
+ desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+ GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_hash;
+ }
+ desc->tfm = tfm;
+
+ ret = crypto_shash_setkey(tfm, hashed_key, hlen);
+ if (ret)
+ goto out_free_desc;
+
+ ret = crypto_shash_init(desc);
+ if (ret)
+ goto out_free_desc;
+
+ ret = crypto_shash_update(desc, challenge, hlen);
+ if (ret)
+ goto out_free_desc;
+
+ ret = crypto_shash_final(desc, aug);
+out_free_desc:
+ kfree_sensitive(desc);
+out_free_hash:
+ crypto_free_shash(tfm);
+out_free_key:
+ kfree_sensitive(hashed_key);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
+
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid)
+{
+ int ret;
+
+ ret = crypto_kpp_set_secret(dh_tfm, NULL, 0);
+ if (ret)
+ pr_debug("failed to set private key, error %d\n", ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey);
+
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+ u8 *host_key, size_t host_key_len)
+{
+ struct kpp_request *req;
+ struct crypto_wait wait;
+ struct scatterlist dst;
+ int ret;
+
+ req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ crypto_init_wait(&wait);
+ kpp_request_set_input(req, NULL, 0);
+ sg_init_one(&dst, host_key, host_key_len);
+ kpp_request_set_output(req, &dst, host_key_len);
+ kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait);
+ kpp_request_free(req);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
+
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+ u8 *ctrl_key, size_t ctrl_key_len,
+ u8 *sess_key, size_t sess_key_len)
+{
+ struct kpp_request *req;
+ struct crypto_wait wait;
+ struct scatterlist src, dst;
+ int ret;
+
+ req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ crypto_init_wait(&wait);
+ sg_init_one(&src, ctrl_key, ctrl_key_len);
+ kpp_request_set_input(req, &src, ctrl_key_len);
+ sg_init_one(&dst, sess_key, sess_key_len);
+ kpp_request_set_output(req, &dst, sess_key_len);
+ kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait);
+
+ kpp_request_free(req);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
+
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
+{
+ struct nvme_dhchap_key *key;
+ u8 key_hash;
+
+ if (!secret) {
+ *ret_key = NULL;
+ return 0;
+ }
+
+ if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1)
+ return -EINVAL;
+
+ /* Pass in the secret without the 'DHHC-1:XX:' prefix */
+ key = nvme_auth_extract_key(secret + 10, key_hash);
+ if (IS_ERR(key)) {
+ *ret_key = NULL;
+ return PTR_ERR(key);
+ }
+
+ *ret_key = key;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 877d2ec4ea9f..2f6a7f8c94e8 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,6 +92,21 @@ config NVME_TCP
If unsure, say N.
+config NVME_AUTH
+ bool "NVM Express over Fabrics In-Band Authentication"
+ depends on NVME_CORE
+ select NVME_COMMON
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_DH
+ select CRYPTO_DH_RFC7919_GROUPS
+ help
+ This provides support for NVMe over Fabrics In-Band Authentication.
+
+ If unsure, say N.
+
config NVME_APPLE
tristate "Apple ANS2 NVM Express host driver"
depends on OF && BLOCK
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a36ae1612059..e27202d22c7d 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -10,12 +10,14 @@ obj-$(CONFIG_NVME_FC) += nvme-fc.o
obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
obj-$(CONFIG_NVME_APPLE) += nvme-apple.o
-nvme-core-y := core.o ioctl.o constants.o
+nvme-core-y += core.o ioctl.o
+nvme-core-$(CONFIG_NVME_VERBOSE_ERRORS) += constants.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
+nvme-core-$(CONFIG_NVME_AUTH) += auth.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 5c352d5d8ee6..5fc5ea196b40 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -845,11 +845,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
apple_nvme_handle_cq(&anv->adminq, true);
spin_unlock_irqrestore(&anv->lock, flags);
- blk_mq_tagset_busy_iter(&anv->tagset, nvme_cancel_request, &anv->ctrl);
- blk_mq_tagset_busy_iter(&anv->admin_tagset, nvme_cancel_request,
- &anv->ctrl);
- blk_mq_tagset_wait_completed_request(&anv->tagset);
- blk_mq_tagset_wait_completed_request(&anv->admin_tagset);
+ nvme_cancel_tagset(&anv->ctrl);
+ nvme_cancel_admin_tagset(&anv->ctrl);
/*
* The driver will not be starting up queues again if shutting down so
@@ -1222,6 +1219,11 @@ static void apple_nvme_async_probe(void *data, async_cookie_t cookie)
nvme_put_ctrl(&anv->ctrl);
}
+static void devm_apple_nvme_put_tag_set(void *data)
+{
+ blk_mq_free_tag_set(data);
+}
+
static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
{
int ret;
@@ -1238,8 +1240,7 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
ret = blk_mq_alloc_tag_set(&anv->admin_tagset);
if (ret)
return ret;
- ret = devm_add_action_or_reset(anv->dev,
- (void (*)(void *))blk_mq_free_tag_set,
+ ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
&anv->admin_tagset);
if (ret)
return ret;
@@ -1263,8 +1264,8 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
ret = blk_mq_alloc_tag_set(&anv->tagset);
if (ret)
return ret;
- ret = devm_add_action_or_reset(
- anv->dev, (void (*)(void *))blk_mq_free_tag_set, &anv->tagset);
+ ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
+ &anv->tagset);
if (ret)
return ret;
@@ -1365,6 +1366,11 @@ static int apple_nvme_attach_genpd(struct apple_nvme *anv)
return 0;
}
+static void devm_apple_nvme_mempool_destroy(void *data)
+{
+ mempool_destroy(data);
+}
+
static int apple_nvme_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -1462,8 +1468,8 @@ static int apple_nvme_probe(struct platform_device *pdev)
ret = -ENOMEM;
goto put_dev;
}
- ret = devm_add_action_or_reset(
- anv->dev, (void (*)(void *))mempool_destroy, anv->iod_mempool);
+ ret = devm_add_action_or_reset(anv->dev,
+ devm_apple_nvme_mempool_destroy, anv->iod_mempool);
if (ret)
goto put_dev;
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
new file mode 100644
index 000000000000..c8a6db7c4498
--- /dev/null
+++ b/drivers/nvme/host/auth.c
@@ -0,0 +1,1017 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-auth.h>
+
+struct nvme_dhchap_queue_context {
+ struct list_head entry;
+ struct work_struct auth_work;
+ struct nvme_ctrl *ctrl;
+ struct crypto_shash *shash_tfm;
+ struct crypto_kpp *dh_tfm;
+ void *buf;
+ size_t buf_size;
+ int qid;
+ int error;
+ u32 s1;
+ u32 s2;
+ u16 transaction;
+ u8 status;
+ u8 hash_id;
+ size_t hash_len;
+ u8 dhgroup_id;
+ u8 c1[64];
+ u8 c2[64];
+ u8 response[64];
+ u8 *host_response;
+ u8 *ctrl_key;
+ int ctrl_key_len;
+ u8 *host_key;
+ int host_key_len;
+ u8 *sess_key;
+ int sess_key_len;
+};
+
+#define nvme_auth_flags_from_qid(qid) \
+ (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED
+#define nvme_auth_queue_from_qid(ctrl, qid) \
+ (qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
+
+static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
+ void *data, size_t data_len, bool auth_send)
+{
+ struct nvme_command cmd = {};
+ blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid);
+ struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid);
+ int ret;
+
+ cmd.auth_common.opcode = nvme_fabrics_command;
+ cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER;
+ cmd.auth_common.spsp0 = 0x01;
+ cmd.auth_common.spsp1 = 0x01;
+ if (auth_send) {
+ cmd.auth_send.fctype = nvme_fabrics_type_auth_send;
+ cmd.auth_send.tl = cpu_to_le32(data_len);
+ } else {
+ cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive;
+ cmd.auth_receive.al = cpu_to_le32(data_len);
+ }
+
+ ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len,
+ qid == 0 ? NVME_QID_ANY : qid,
+ 0, flags);
+ if (ret > 0)
+ dev_warn(ctrl->device,
+ "qid %d auth_send failed with status %d\n", qid, ret);
+ else if (ret < 0)
+ dev_err(ctrl->device,
+ "qid %d auth_send failed with error %d\n", qid, ret);
+ return ret;
+}
+
+static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid,
+ struct nvmf_auth_dhchap_failure_data *data,
+ u16 transaction, u8 expected_msg)
+{
+ dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n",
+ __func__, qid, data->auth_type, data->auth_id);
+
+ if (data->auth_type == NVME_AUTH_COMMON_MESSAGES &&
+ data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+ return data->rescode_exp;
+ }
+ if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES ||
+ data->auth_id != expected_msg) {
+ dev_warn(ctrl->device,
+ "qid %d invalid message %02x/%02x\n",
+ qid, data->auth_type, data->auth_id);
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+ }
+ if (le16_to_cpu(data->t_id) != transaction) {
+ dev_warn(ctrl->device,
+ "qid %d invalid transaction ID %d\n",
+ qid, le16_to_cpu(data->t_id));
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+ }
+ return 0;
+}
+
+static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
+ size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
+
+ if (chap->buf_size < size) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return -EINVAL;
+ }
+ memset((u8 *)chap->buf, 0, size);
+ data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+ data->t_id = cpu_to_le16(chap->transaction);
+ data->sc_c = 0; /* No secure channel concatenation */
+ data->napd = 1;
+ data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
+ data->auth_protocol[0].dhchap.halen = 3;
+ data->auth_protocol[0].dhchap.dhlen = 6;
+ data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
+ data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
+ data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
+ data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
+ data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
+ data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
+ data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
+ data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
+ data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
+
+ return size;
+}
+
+static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
+ u16 dhvlen = le16_to_cpu(data->dhvlen);
+ size_t size = sizeof(*data) + data->hl + dhvlen;
+ const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
+ const char *hmac_name, *kpp_name;
+
+ if (chap->buf_size < size) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return NVME_SC_INVALID_FIELD;
+ }
+
+ hmac_name = nvme_auth_hmac_name(data->hashid);
+ if (!hmac_name) {
+ dev_warn(ctrl->device,
+ "qid %d: invalid HASH ID %d\n",
+ chap->qid, data->hashid);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+ return NVME_SC_INVALID_FIELD;
+ }
+
+ if (chap->hash_id == data->hashid && chap->shash_tfm &&
+ !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
+ crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
+ dev_dbg(ctrl->device,
+ "qid %d: reuse existing hash %s\n",
+ chap->qid, hmac_name);
+ goto select_kpp;
+ }
+
+ /* Reset if hash cannot be reused */
+ if (chap->shash_tfm) {
+ crypto_free_shash(chap->shash_tfm);
+ chap->hash_id = 0;
+ chap->hash_len = 0;
+ }
+ chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
+ if (IS_ERR(chap->shash_tfm)) {
+ dev_warn(ctrl->device,
+ "qid %d: failed to allocate hash %s, error %ld\n",
+ chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
+ chap->shash_tfm = NULL;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ return NVME_SC_AUTH_REQUIRED;
+ }
+
+ if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
+ dev_warn(ctrl->device,
+ "qid %d: invalid hash length %d\n",
+ chap->qid, data->hl);
+ crypto_free_shash(chap->shash_tfm);
+ chap->shash_tfm = NULL;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+ return NVME_SC_AUTH_REQUIRED;
+ }
+
+ /* Reset host response if the hash had been changed */
+ if (chap->hash_id != data->hashid) {
+ kfree(chap->host_response);
+ chap->host_response = NULL;
+ }
+
+ chap->hash_id = data->hashid;
+ chap->hash_len = data->hl;
+ dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
+ chap->qid, hmac_name);
+
+select_kpp:
+ kpp_name = nvme_auth_dhgroup_kpp(data->dhgid);
+ if (!kpp_name) {
+ dev_warn(ctrl->device,
+ "qid %d: invalid DH group id %d\n",
+ chap->qid, data->dhgid);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+ /* Leave previous dh_tfm intact */
+ return NVME_SC_AUTH_REQUIRED;
+ }
+
+ /* Clear host and controller key to avoid accidental reuse */
+ kfree_sensitive(chap->host_key);
+ chap->host_key = NULL;
+ chap->host_key_len = 0;
+ kfree_sensitive(chap->ctrl_key);
+ chap->ctrl_key = NULL;
+ chap->ctrl_key_len = 0;
+
+ if (chap->dhgroup_id == data->dhgid &&
+ (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
+ dev_dbg(ctrl->device,
+ "qid %d: reuse existing DH group %s\n",
+ chap->qid, gid_name);
+ goto skip_kpp;
+ }
+
+ /* Reset dh_tfm if it can't be reused */
+ if (chap->dh_tfm) {
+ crypto_free_kpp(chap->dh_tfm);
+ chap->dh_tfm = NULL;
+ }
+
+ if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
+ if (dhvlen == 0) {
+ dev_warn(ctrl->device,
+ "qid %d: empty DH value\n",
+ chap->qid);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+ return NVME_SC_INVALID_FIELD;
+ }
+
+ chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0);
+ if (IS_ERR(chap->dh_tfm)) {
+ int ret = PTR_ERR(chap->dh_tfm);
+
+ dev_warn(ctrl->device,
+ "qid %d: error %d initializing DH group %s\n",
+ chap->qid, ret, gid_name);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+ chap->dh_tfm = NULL;
+ return NVME_SC_AUTH_REQUIRED;
+ }
+ dev_dbg(ctrl->device, "qid %d: selected DH group %s\n",
+ chap->qid, gid_name);
+ } else if (dhvlen != 0) {
+ dev_warn(ctrl->device,
+ "qid %d: invalid DH value for NULL DH\n",
+ chap->qid);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return NVME_SC_INVALID_FIELD;
+ }
+ chap->dhgroup_id = data->dhgid;
+
+skip_kpp:
+ chap->s1 = le32_to_cpu(data->seqnum);
+ memcpy(chap->c1, data->cval, chap->hash_len);
+ if (dhvlen) {
+ chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL);
+ if (!chap->ctrl_key) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ return NVME_SC_AUTH_REQUIRED;
+ }
+ chap->ctrl_key_len = dhvlen;
+ memcpy(chap->ctrl_key, data->cval + chap->hash_len,
+ dhvlen);
+ dev_dbg(ctrl->device, "ctrl public key %*ph\n",
+ (int)chap->ctrl_key_len, chap->ctrl_key);
+ }
+
+ return 0;
+}
+
+static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_reply_data *data = chap->buf;
+ size_t size = sizeof(*data);
+
+ size += 2 * chap->hash_len;
+
+ if (chap->host_key_len)
+ size += chap->host_key_len;
+
+ if (chap->buf_size < size) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return -EINVAL;
+ }
+
+ memset(chap->buf, 0, size);
+ data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+ data->t_id = cpu_to_le16(chap->transaction);
+ data->hl = chap->hash_len;
+ data->dhvlen = cpu_to_le16(chap->host_key_len);
+ memcpy(data->rval, chap->response, chap->hash_len);
+ if (ctrl->ctrl_key) {
+ get_random_bytes(chap->c2, chap->hash_len);
+ data->cvalid = 1;
+ chap->s2 = nvme_auth_get_seqnum();
+ memcpy(data->rval + chap->hash_len, chap->c2,
+ chap->hash_len);
+ dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n",
+ __func__, chap->qid, (int)chap->hash_len, chap->c2);
+ } else {
+ memset(chap->c2, 0, chap->hash_len);
+ chap->s2 = 0;
+ }
+ data->seqnum = cpu_to_le32(chap->s2);
+ if (chap->host_key_len) {
+ dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
+ __func__, chap->qid,
+ chap->host_key_len, chap->host_key);
+ memcpy(data->rval + 2 * chap->hash_len, chap->host_key,
+ chap->host_key_len);
+ }
+
+ return size;
+}
+
+static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_success1_data *data = chap->buf;
+ size_t size = sizeof(*data);
+
+ if (ctrl->ctrl_key)
+ size += chap->hash_len;
+
+ if (chap->buf_size < size) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return NVME_SC_INVALID_FIELD;
+ }
+
+ if (data->hl != chap->hash_len) {
+ dev_warn(ctrl->device,
+ "qid %d: invalid hash length %u\n",
+ chap->qid, data->hl);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+ return NVME_SC_INVALID_FIELD;
+ }
+
+ /* Just print out information for the admin queue */
+ if (chap->qid == 0)
+ dev_info(ctrl->device,
+ "qid 0: authenticated with hash %s dhgroup %s\n",
+ nvme_auth_hmac_name(chap->hash_id),
+ nvme_auth_dhgroup_name(chap->dhgroup_id));
+
+ if (!data->rvalid)
+ return 0;
+
+ /* Validate controller response */
+ if (memcmp(chap->response, data->rval, data->hl)) {
+ dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n",
+ __func__, chap->qid, (int)chap->hash_len, data->rval);
+ dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n",
+ __func__, chap->qid, (int)chap->hash_len,
+ chap->response);
+ dev_warn(ctrl->device,
+ "qid %d: controller authentication failed\n",
+ chap->qid);
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ return NVME_SC_AUTH_REQUIRED;
+ }
+
+ /* Just print out information for the admin queue */
+ if (chap->qid == 0)
+ dev_info(ctrl->device,
+ "qid 0: controller authenticated\n");
+ return 0;
+}
+
+static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_success2_data *data = chap->buf;
+ size_t size = sizeof(*data);
+
+ memset(chap->buf, 0, size);
+ data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+ data->t_id = cpu_to_le16(chap->transaction);
+
+ return size;
+}
+
+static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ struct nvmf_auth_dhchap_failure_data *data = chap->buf;
+ size_t size = sizeof(*data);
+
+ memset(chap->buf, 0, size);
+ data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+ data->t_id = cpu_to_le16(chap->transaction);
+ data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+ data->rescode_exp = chap->status;
+
+ return size;
+}
+
+static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+ u8 buf[4], *challenge = chap->c1;
+ int ret;
+
+ dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
+ __func__, chap->qid, chap->s1, chap->transaction);
+
+ if (!chap->host_response) {
+ chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+ ctrl->opts->host->nqn);
+ if (IS_ERR(chap->host_response)) {
+ ret = PTR_ERR(chap->host_response);
+ chap->host_response = NULL;
+ return ret;
+ }
+ } else {
+ dev_dbg(ctrl->device, "%s: qid %d re-using host response\n",
+ __func__, chap->qid);
+ }
+
+ ret = crypto_shash_setkey(chap->shash_tfm,
+ chap->host_response, ctrl->host_key->len);
+ if (ret) {
+ dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+ chap->qid, ret);
+ goto out;
+ }
+
+ if (chap->dh_tfm) {
+ challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+ if (!challenge) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = nvme_auth_augmented_challenge(chap->hash_id,
+ chap->sess_key,
+ chap->sess_key_len,
+ chap->c1, challenge,
+ chap->hash_len);
+ if (ret)
+ goto out;
+ }
+
+ shash->tfm = chap->shash_tfm;
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, challenge, chap->hash_len);
+ if (ret)
+ goto out;
+ put_unaligned_le32(chap->s1, buf);
+ ret = crypto_shash_update(shash, buf, 4);
+ if (ret)
+ goto out;
+ put_unaligned_le16(chap->transaction, buf);
+ ret = crypto_shash_update(shash, buf, 2);
+ if (ret)
+ goto out;
+ memset(buf, 0, sizeof(buf));
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, "HostHost", 8);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+ strlen(ctrl->opts->host->nqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+ strlen(ctrl->opts->subsysnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_final(shash, chap->response);
+out:
+ if (challenge != chap->c1)
+ kfree(challenge);
+ return ret;
+}
+
+static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+ u8 *ctrl_response;
+ u8 buf[4], *challenge = chap->c2;
+ int ret;
+
+ ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+ ctrl->opts->subsysnqn);
+ if (IS_ERR(ctrl_response)) {
+ ret = PTR_ERR(ctrl_response);
+ return ret;
+ }
+ ret = crypto_shash_setkey(chap->shash_tfm,
+ ctrl_response, ctrl->ctrl_key->len);
+ if (ret) {
+ dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+ chap->qid, ret);
+ goto out;
+ }
+
+ if (chap->dh_tfm) {
+ challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+ if (!challenge) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = nvme_auth_augmented_challenge(chap->hash_id,
+ chap->sess_key,
+ chap->sess_key_len,
+ chap->c2, challenge,
+ chap->hash_len);
+ if (ret)
+ goto out;
+ }
+ dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
+ __func__, chap->qid, chap->s2, chap->transaction);
+ dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
+ __func__, chap->qid, (int)chap->hash_len, challenge);
+ dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n",
+ __func__, chap->qid, ctrl->opts->subsysnqn);
+ dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
+ __func__, chap->qid, ctrl->opts->host->nqn);
+ shash->tfm = chap->shash_tfm;
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, challenge, chap->hash_len);
+ if (ret)
+ goto out;
+ put_unaligned_le32(chap->s2, buf);
+ ret = crypto_shash_update(shash, buf, 4);
+ if (ret)
+ goto out;
+ put_unaligned_le16(chap->transaction, buf);
+ ret = crypto_shash_update(shash, buf, 2);
+ if (ret)
+ goto out;
+ memset(buf, 0, 4);
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, "Controller", 10);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+ strlen(ctrl->opts->subsysnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+ strlen(ctrl->opts->host->nqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_final(shash, chap->response);
+out:
+ if (challenge != chap->c2)
+ kfree(challenge);
+ kfree(ctrl_response);
+ return ret;
+}
+
+static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
+ struct nvme_dhchap_queue_context *chap)
+{
+ int ret;
+
+ if (chap->host_key && chap->host_key_len) {
+ dev_dbg(ctrl->device,
+ "qid %d: reusing host key\n", chap->qid);
+ goto gen_sesskey;
+ }
+ ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id);
+ if (ret < 0) {
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return ret;
+ }
+
+ chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm);
+
+ chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL);
+ if (!chap->host_key) {
+ chap->host_key_len = 0;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ return -ENOMEM;
+ }
+ ret = nvme_auth_gen_pubkey(chap->dh_tfm,
+ chap->host_key, chap->host_key_len);
+ if (ret) {
+ dev_dbg(ctrl->device,
+ "failed to generate public key, error %d\n", ret);
+ kfree(chap->host_key);
+ chap->host_key = NULL;
+ chap->host_key_len = 0;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return ret;
+ }
+
+gen_sesskey:
+ chap->sess_key_len = chap->host_key_len;
+ chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
+ if (!chap->sess_key) {
+ chap->sess_key_len = 0;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ return -ENOMEM;
+ }
+
+ ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
+ chap->ctrl_key, chap->ctrl_key_len,
+ chap->sess_key, chap->sess_key_len);
+ if (ret) {
+ dev_dbg(ctrl->device,
+ "failed to generate shared secret, error %d\n", ret);
+ kfree_sensitive(chap->sess_key);
+ chap->sess_key = NULL;
+ chap->sess_key_len = 0;
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ return ret;
+ }
+ dev_dbg(ctrl->device, "shared secret %*ph\n",
+ (int)chap->sess_key_len, chap->sess_key);
+ return 0;
+}
+
+static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+{
+ kfree_sensitive(chap->host_response);
+ chap->host_response = NULL;
+ kfree_sensitive(chap->host_key);
+ chap->host_key = NULL;
+ chap->host_key_len = 0;
+ kfree_sensitive(chap->ctrl_key);
+ chap->ctrl_key = NULL;
+ chap->ctrl_key_len = 0;
+ kfree_sensitive(chap->sess_key);
+ chap->sess_key = NULL;
+ chap->sess_key_len = 0;
+ chap->status = 0;
+ chap->error = 0;
+ chap->s1 = 0;
+ chap->s2 = 0;
+ chap->transaction = 0;
+ memset(chap->c1, 0, sizeof(chap->c1));
+ memset(chap->c2, 0, sizeof(chap->c2));
+}
+
+static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+{
+ __nvme_auth_reset(chap);
+ if (chap->shash_tfm)
+ crypto_free_shash(chap->shash_tfm);
+ if (chap->dh_tfm)
+ crypto_free_kpp(chap->dh_tfm);
+ kfree_sensitive(chap->ctrl_key);
+ kfree_sensitive(chap->host_key);
+ kfree_sensitive(chap->sess_key);
+ kfree_sensitive(chap->host_response);
+ kfree(chap->buf);
+ kfree(chap);
+}
+
+static void __nvme_auth_work(struct work_struct *work)
+{
+ struct nvme_dhchap_queue_context *chap =
+ container_of(work, struct nvme_dhchap_queue_context, auth_work);
+ struct nvme_ctrl *ctrl = chap->ctrl;
+ size_t tl;
+ int ret = 0;
+
+ chap->transaction = ctrl->transaction++;
+
+ /* DH-HMAC-CHAP Step 1: send negotiate */
+ dev_dbg(ctrl->device, "%s: qid %d send negotiate\n",
+ __func__, chap->qid);
+ ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap);
+ if (ret < 0) {
+ chap->error = ret;
+ return;
+ }
+ tl = ret;
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+ if (ret) {
+ chap->error = ret;
+ return;
+ }
+
+ /* DH-HMAC-CHAP Step 2: receive challenge */
+ dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
+ __func__, chap->qid);
+
+ memset(chap->buf, 0, chap->buf_size);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid %d failed to receive challenge, %s %d\n",
+ chap->qid, ret < 0 ? "error" : "nvme status", ret);
+ chap->error = ret;
+ return;
+ }
+ ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction,
+ NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
+ if (ret) {
+ chap->status = ret;
+ chap->error = NVME_SC_AUTH_REQUIRED;
+ return;
+ }
+
+ ret = nvme_auth_process_dhchap_challenge(ctrl, chap);
+ if (ret) {
+ /* Invalid challenge parameters */
+ chap->error = ret;
+ goto fail2;
+ }
+
+ if (chap->ctrl_key_len) {
+ dev_dbg(ctrl->device,
+ "%s: qid %d DH exponential\n",
+ __func__, chap->qid);
+ ret = nvme_auth_dhchap_exponential(ctrl, chap);
+ if (ret) {
+ chap->error = ret;
+ goto fail2;
+ }
+ }
+
+ dev_dbg(ctrl->device, "%s: qid %d host response\n",
+ __func__, chap->qid);
+ ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+ if (ret) {
+ chap->error = ret;
+ goto fail2;
+ }
+
+ /* DH-HMAC-CHAP Step 3: send reply */
+ dev_dbg(ctrl->device, "%s: qid %d send reply\n",
+ __func__, chap->qid);
+ ret = nvme_auth_set_dhchap_reply_data(ctrl, chap);
+ if (ret < 0) {
+ chap->error = ret;
+ goto fail2;
+ }
+
+ tl = ret;
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+ if (ret) {
+ chap->error = ret;
+ goto fail2;
+ }
+
+ /* DH-HMAC-CHAP Step 4: receive success1 */
+ dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
+ __func__, chap->qid);
+
+ memset(chap->buf, 0, chap->buf_size);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid %d failed to receive success1, %s %d\n",
+ chap->qid, ret < 0 ? "error" : "nvme status", ret);
+ chap->error = ret;
+ return;
+ }
+ ret = nvme_auth_receive_validate(ctrl, chap->qid,
+ chap->buf, chap->transaction,
+ NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
+ if (ret) {
+ chap->status = ret;
+ chap->error = NVME_SC_AUTH_REQUIRED;
+ return;
+ }
+
+ if (ctrl->ctrl_key) {
+ dev_dbg(ctrl->device,
+ "%s: qid %d controller response\n",
+ __func__, chap->qid);
+ ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
+ if (ret) {
+ chap->error = ret;
+ goto fail2;
+ }
+ }
+
+ ret = nvme_auth_process_dhchap_success1(ctrl, chap);
+ if (ret) {
+ /* Controller authentication failed */
+ chap->error = NVME_SC_AUTH_REQUIRED;
+ goto fail2;
+ }
+
+ if (ctrl->ctrl_key) {
+ /* DH-HMAC-CHAP Step 5: send success2 */
+ dev_dbg(ctrl->device, "%s: qid %d send success2\n",
+ __func__, chap->qid);
+ tl = nvme_auth_set_dhchap_success2_data(ctrl, chap);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+ if (ret)
+ chap->error = ret;
+ }
+ if (!ret) {
+ chap->error = 0;
+ return;
+ }
+
+fail2:
+ dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
+ __func__, chap->qid, chap->status);
+ tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+ /*
+ * only update error if send failure2 failed and no other
+ * error had been set during authentication.
+ */
+ if (ret && !chap->error)
+ chap->error = ret;
+}
+
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+ struct nvme_dhchap_queue_context *chap;
+
+ if (!ctrl->host_key) {
+ dev_warn(ctrl->device, "qid %d: no key\n", qid);
+ return -ENOKEY;
+ }
+
+ if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) {
+ dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid);
+ return -ENOKEY;
+ }
+
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ /* Check if the context is already queued */
+ list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+ WARN_ON(!chap->buf);
+ if (chap->qid == qid) {
+ dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ flush_work(&chap->auth_work);
+ __nvme_auth_reset(chap);
+ queue_work(nvme_wq, &chap->auth_work);
+ return 0;
+ }
+ }
+ chap = kzalloc(sizeof(*chap), GFP_KERNEL);
+ if (!chap) {
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ return -ENOMEM;
+ }
+ chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
+ chap->ctrl = ctrl;
+
+ /*
+ * Allocate a large enough buffer for the entire negotiation:
+ * 4k should be enough to ffdhe8192.
+ */
+ chap->buf_size = 4096;
+ chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
+ if (!chap->buf) {
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ kfree(chap);
+ return -ENOMEM;
+ }
+
+ INIT_WORK(&chap->auth_work, __nvme_auth_work);
+ list_add(&chap->entry, &ctrl->dhchap_auth_list);
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ queue_work(nvme_wq, &chap->auth_work);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_negotiate);
+
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+ struct nvme_dhchap_queue_context *chap;
+ int ret;
+
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+ if (chap->qid != qid)
+ continue;
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ flush_work(&chap->auth_work);
+ ret = chap->error;
+ return ret;
+ }
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_wait);
+
+void nvme_auth_reset(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dhchap_queue_context *chap;
+
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ flush_work(&chap->auth_work);
+ __nvme_auth_reset(chap);
+ }
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_reset);
+
+static void nvme_dhchap_auth_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, dhchap_auth_work);
+ int ret, q;
+
+ /* Authenticate admin queue first */
+ ret = nvme_auth_negotiate(ctrl, 0);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid 0: error %d setting up authentication\n", ret);
+ return;
+ }
+ ret = nvme_auth_wait(ctrl, 0);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid 0: authentication failed\n");
+ return;
+ }
+
+ for (q = 1; q < ctrl->queue_count; q++) {
+ ret = nvme_auth_negotiate(ctrl, q);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid %d: error %d setting up authentication\n",
+ q, ret);
+ break;
+ }
+ }
+
+ /*
+ * Failure is a soft-state; credentials remain valid until
+ * the controller terminates the connection.
+ */
+}
+
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+ INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
+ INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+ mutex_init(&ctrl->dhchap_auth_mutex);
+ if (!ctrl->opts)
+ return;
+ nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
+ nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
+
+void nvme_auth_stop(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+ cancel_work_sync(&ctrl->dhchap_auth_work);
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
+ cancel_work_sync(&chap->auth_work);
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_stop);
+
+void nvme_auth_free(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
+ list_del_init(&chap->entry);
+ flush_work(&chap->auth_work);
+ __nvme_auth_free(chap);
+ }
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ if (ctrl->host_key) {
+ nvme_auth_free_key(ctrl->host_key);
+ ctrl->host_key = NULL;
+ }
+ if (ctrl->ctrl_key) {
+ nvme_auth_free_key(ctrl->ctrl_key);
+ ctrl->ctrl_key = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free);
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 4910543f00ff..e958d5015585 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -6,7 +6,6 @@
#include "nvme.h"
-#ifdef CONFIG_NVME_VERBOSE_ERRORS
static const char * const nvme_ops[] = {
[nvme_cmd_flush] = "Flush",
[nvme_cmd_write] = "Write",
@@ -178,6 +177,7 @@ const unsigned char *nvme_get_opcode_str(u8 opcode)
return nvme_ops[opcode];
return "Unknown";
}
+EXPORT_SYMBOL_GPL(nvme_get_opcode_str);
const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
{
@@ -185,4 +185,3 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
return nvme_admin_ops[opcode];
return "Unknown";
}
-#endif /* CONFIG_NVME_VERBOSE_ERRORS */
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2533b88e66d5..2429b11eb9a8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -24,12 +24,22 @@
#include "nvme.h"
#include "fabrics.h"
+#include <linux/nvme-auth.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#define NVME_MINORS (1U << MINORBITS)
+struct nvme_ns_info {
+ struct nvme_ns_ids ids;
+ u32 nsid;
+ __le32 anagrpid;
+ bool is_shared;
+ bool is_readonly;
+ bool is_ready;
+};
+
unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -330,6 +340,7 @@ enum nvme_disposition {
COMPLETE,
RETRY,
FAILOVER,
+ AUTHENTICATE,
};
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
@@ -337,6 +348,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
if (likely(nvme_req(req)->status == 0))
return COMPLETE;
+ if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+ return AUTHENTICATE;
+
if (blk_noretry_request(req) ||
(nvme_req(req)->status & NVME_SC_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
@@ -375,11 +389,13 @@ static inline void nvme_end_req(struct request *req)
void nvme_complete_rq(struct request *req)
{
+ struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
- if (nvme_req(req)->ctrl->kas)
- nvme_req(req)->ctrl->comp_seen = true;
+ if (ctrl->kas)
+ ctrl->comp_seen = true;
switch (nvme_decide_disposition(req)) {
case COMPLETE:
@@ -391,6 +407,14 @@ void nvme_complete_rq(struct request *req)
case FAILOVER:
nvme_failover_req(req);
return;
+ case AUTHENTICATE:
+#ifdef CONFIG_NVME_AUTH
+ queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+ nvme_retry_req(req);
+#else
+ nvme_end_req(req);
+#endif
+ return;
}
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -702,7 +726,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
switch (ctrl->state) {
case NVME_CTRL_CONNECTING:
if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
- req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+ (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
+ req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
+ req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
return true;
break;
default:
@@ -990,8 +1016,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head)
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags)
+ int qid, int at_head, blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
@@ -1000,15 +1025,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
else
req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
- qid ? qid - 1 : 0);
+ qid - 1);
if (IS_ERR(req))
return PTR_ERR(req);
nvme_init_request(req, cmd);
- if (timeout)
- req->timeout = timeout;
-
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
if (ret)
@@ -1028,7 +1050,7 @@ EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
- return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
+ return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -1329,8 +1351,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
}
-static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
- struct nvme_ns_ids *ids)
+static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info)
{
struct nvme_command c = { };
bool csi_seen = false;
@@ -1343,7 +1365,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
return 0;
c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(nsid);
+ c.identify.nsid = cpu_to_le32(info->nsid);
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
@@ -1355,7 +1377,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (status) {
dev_warn(ctrl->device,
"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
- nsid, status);
+ info->nsid, status);
goto free_data;
}
@@ -1365,7 +1387,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
- len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
+ len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
if (len < 0)
break;
@@ -1374,7 +1396,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (nvme_multi_css(ctrl) && !csi_seen) {
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
- nsid);
+ info->nsid);
status = -EINVAL;
}
@@ -1384,7 +1406,7 @@ free_data:
}
static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
- struct nvme_ns_ids *ids, struct nvme_id_ns **id)
+ struct nvme_id_ns **id)
{
struct nvme_command c = { };
int error;
@@ -1407,51 +1429,66 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
error = NVME_SC_INVALID_NS | NVME_SC_DNR;
if ((*id)->ncap == 0) /* namespace not allocated or attached */
goto out_free_id;
+ return 0;
+out_free_id:
+ kfree(*id);
+ return error;
+}
+static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info)
+{
+ struct nvme_ns_ids *ids = &info->ids;
+ struct nvme_id_ns *id;
+ int ret;
+
+ ret = nvme_identify_ns(ctrl, info->nsid, &id);
+ if (ret)
+ return ret;
+ info->anagrpid = id->anagrpid;
+ info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+ info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+ info->is_ready = true;
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
} else {
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
- memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+ memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0) &&
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
- memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+ memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
}
-
+ kfree(id);
return 0;
-
-out_free_id:
- kfree(*id);
- return error;
}
-static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
- struct nvme_id_ns_cs_indep **id)
+static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info)
{
+ struct nvme_id_ns_cs_indep *id;
struct nvme_command c = {
.identify.opcode = nvme_admin_identify,
- .identify.nsid = cpu_to_le32(nsid),
+ .identify.nsid = cpu_to_le32(info->nsid),
.identify.cns = NVME_ID_CNS_NS_CS_INDEP,
};
int ret;
- *id = kmalloc(sizeof(**id), GFP_KERNEL);
- if (!*id)
+ id = kmalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
return -ENOMEM;
- ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
- if (ret) {
- dev_warn(ctrl->device,
- "Identify namespace (CS independent) failed (%d)\n",
- ret);
- kfree(*id);
- return ret;
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+ if (!ret) {
+ info->anagrpid = id->anagrpid;
+ info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+ info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+ info->is_ready = id->nstat & NVME_NSTAT_NRDY;
}
-
- return 0;
+ kfree(id);
+ return ret;
}
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
@@ -1466,7 +1503,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
- buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+ buffer, buflen, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
@@ -1875,6 +1912,11 @@ static void nvme_update_disk_info(struct gendisk *disk,
ns->ctrl->max_zeroes_sectors);
}
+static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
+}
+
static inline bool nvme_first_scan(struct gendisk *disk)
{
/* nvme_alloc_ns() scans the disk prior to adding it */
@@ -1912,12 +1954,44 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
blk_queue_chunk_sectors(ns->queue, iob);
}
-static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_update_ns_info_generic(struct nvme_ns *ns,
+ struct nvme_ns_info *info)
{
- unsigned lbaf = nvme_lbaf_index(id->flbas);
+ blk_mq_freeze_queue(ns->disk->queue);
+ nvme_set_queue_limits(ns->ctrl, ns->queue);
+ set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+ blk_mq_unfreeze_queue(ns->disk->queue);
+
+ if (nvme_ns_head_multipath(ns->head)) {
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+ blk_stack_limits(&ns->head->disk->queue->limits,
+ &ns->queue->limits, 0);
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
+ }
+
+ /* Hide the block-interface for these devices */
+ ns->disk->flags |= GENHD_FL_HIDDEN;
+ set_bit(NVME_NS_READY, &ns->flags);
+
+ return 0;
+}
+
+static int nvme_update_ns_info_block(struct nvme_ns *ns,
+ struct nvme_ns_info *info)
+{
+ struct nvme_id_ns *id;
+ unsigned lbaf;
int ret;
+ ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
+ if (ret)
+ return ret;
+
blk_mq_freeze_queue(ns->disk->queue);
+ lbaf = nvme_lbaf_index(id->flbas);
ns->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
@@ -1927,36 +2001,35 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
- if (ret)
- goto out_unfreeze;
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue);
+ goto out;
+ }
}
- set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
- test_bit(NVME_NS_FORCE_RO, &ns->flags));
+ set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
ret = nvme_revalidate_zones(ns);
if (ret && !nvme_first_scan(ns->disk))
- return ret;
+ goto out;
}
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
nvme_update_disk_info(ns->head->disk, ns, id);
- set_disk_ro(ns->head->disk,
- (id->nsattr & NVME_NS_ATTR_RO) ||
- test_bit(NVME_NS_FORCE_RO, &ns->flags));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
disk_update_readahead(ns->head->disk);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
- return 0;
-out_unfreeze:
+ ret = 0;
+out:
/*
* If probing fails due an unsupported feature, hide the block device,
* but still allow other access.
@@ -1966,10 +2039,31 @@ out_unfreeze:
set_bit(NVME_NS_READY, &ns->flags);
ret = 0;
}
- blk_mq_unfreeze_queue(ns->disk->queue);
+ kfree(id);
return ret;
}
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ switch (info->ids.csi) {
+ case NVME_CSI_ZNS:
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ dev_info(ns->ctrl->device,
+ "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+ info->nsid);
+ return nvme_update_ns_info_generic(ns, info);
+ }
+ return nvme_update_ns_info_block(ns, info);
+ case NVME_CSI_NVM:
+ return nvme_update_ns_info_block(ns, info);
+ default:
+ dev_info(ns->ctrl->device,
+ "block device for nsid %u not supported (csi %u)\n",
+ info->nsid, info->ids.csi);
+ return nvme_update_ns_info_generic(ns, info);
+ }
+}
+
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@@ -2103,7 +2197,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
cmd.common.cdw11 = cpu_to_le32(len);
- return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
+ return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
@@ -2123,6 +2217,7 @@ static int nvme_report_zones(struct gendisk *disk, sector_t sector,
static const struct block_device_operations nvme_bdev_ops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
+ .compat_ioctl = blkdev_compat_ptr_ioctl,
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
@@ -3613,6 +3708,108 @@ static ssize_t dctype_show(struct device *dev,
}
static DEVICE_ATTR_RO(dctype);
+#ifdef CONFIG_NVME_AUTH
+static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+
+ if (!opts->dhchap_secret)
+ return sysfs_emit(buf, "none\n");
+ return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ char *dhchap_secret;
+
+ if (!ctrl->opts->dhchap_secret)
+ return -EINVAL;
+ if (count < 7)
+ return -EINVAL;
+ if (memcmp(buf, "DHHC-1:", 7))
+ return -EINVAL;
+
+ dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+ if (!dhchap_secret)
+ return -ENOMEM;
+ memcpy(dhchap_secret, buf, count);
+ nvme_auth_stop(ctrl);
+ if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+ int ret;
+
+ ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+ if (ret)
+ return ret;
+ kfree(opts->dhchap_secret);
+ opts->dhchap_secret = dhchap_secret;
+ /* Key has changed; re-authentication with new key */
+ nvme_auth_reset(ctrl);
+ }
+ /* Start re-authentication */
+ dev_info(ctrl->device, "re-authenticating controller\n");
+ queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+ return count;
+}
+static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
+ nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+
+ if (!opts->dhchap_ctrl_secret)
+ return sysfs_emit(buf, "none\n");
+ return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ char *dhchap_secret;
+
+ if (!ctrl->opts->dhchap_ctrl_secret)
+ return -EINVAL;
+ if (count < 7)
+ return -EINVAL;
+ if (memcmp(buf, "DHHC-1:", 7))
+ return -EINVAL;
+
+ dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+ if (!dhchap_secret)
+ return -ENOMEM;
+ memcpy(dhchap_secret, buf, count);
+ nvme_auth_stop(ctrl);
+ if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+ int ret;
+
+ ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+ if (ret)
+ return ret;
+ kfree(opts->dhchap_ctrl_secret);
+ opts->dhchap_ctrl_secret = dhchap_secret;
+ /* Key has changed; re-authentication with new key */
+ nvme_auth_reset(ctrl);
+ }
+ /* Start re-authentication */
+ dev_info(ctrl->device, "re-authenticating controller\n");
+ queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+ return count;
+}
+static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
+ nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
+#endif
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -3636,6 +3833,10 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_kato.attr,
&dev_attr_cntrltype.attr,
&dev_attr_dctype.attr,
+#ifdef CONFIG_NVME_AUTH
+ &dev_attr_dhchap_secret.attr,
+ &dev_attr_dhchap_ctrl_secret.attr,
+#endif
NULL
};
@@ -3659,6 +3860,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
return 0;
+#ifdef CONFIG_NVME_AUTH
+ if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
+ return 0;
+ if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
+ return 0;
+#endif
return a->mode;
}
@@ -3786,7 +3993,7 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns)
}
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
- unsigned nsid, struct nvme_ns_ids *ids, bool is_shared)
+ struct nvme_ns_info *info)
{
struct nvme_ns_head *head;
size_t size = sizeof(*head);
@@ -3808,9 +4015,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
if (ret)
goto out_ida_remove;
head->subsys = ctrl->subsys;
- head->ns_id = nsid;
- head->ids = *ids;
- head->shared = is_shared;
+ head->ns_id = info->nsid;
+ head->ids = info->ids;
+ head->shared = info->is_shared;
kref_init(&head->ref);
if (head->ids.csi) {
@@ -3867,54 +4074,54 @@ static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
return ret;
}
-static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
- struct nvme_ns_ids *ids, bool is_shared)
+static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
{
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_ns_head *head = NULL;
int ret;
- ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
+ ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
if (ret) {
dev_err(ctrl->device,
- "globally duplicate IDs for nsid %d\n", nsid);
+ "globally duplicate IDs for nsid %d\n", info->nsid);
nvme_print_device_info(ctrl);
return ret;
}
mutex_lock(&ctrl->subsys->lock);
- head = nvme_find_ns_head(ctrl, nsid);
+ head = nvme_find_ns_head(ctrl, info->nsid);
if (!head) {
- ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
+ ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
if (ret) {
dev_err(ctrl->device,
"duplicate IDs in subsystem for nsid %d\n",
- nsid);
+ info->nsid);
goto out_unlock;
}
- head = nvme_alloc_ns_head(ctrl, nsid, ids, is_shared);
+ head = nvme_alloc_ns_head(ctrl, info);
if (IS_ERR(head)) {
ret = PTR_ERR(head);
goto out_unlock;
}
} else {
ret = -EINVAL;
- if (!is_shared || !head->shared) {
+ if (!info->is_shared || !head->shared) {
dev_err(ctrl->device,
- "Duplicate unshared namespace %d\n", nsid);
+ "Duplicate unshared namespace %d\n",
+ info->nsid);
goto out_put_ns_head;
}
- if (!nvme_ns_ids_equal(&head->ids, ids)) {
+ if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
- nsid);
+ info->nsid);
goto out_put_ns_head;
}
if (!multipath && !list_empty(&head->list)) {
dev_warn(ctrl->device,
"Found shared namespace %d, but multipathing not supported.\n",
- nsid);
+ info->nsid);
dev_warn_once(ctrl->device,
"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
}
@@ -3968,20 +4175,15 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
list_add(&ns->list, &ns->ctrl->namespaces);
}
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
- struct nvme_ns_ids *ids)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
{
struct nvme_ns *ns;
struct gendisk *disk;
- struct nvme_id_ns *id;
int node = ctrl->numa_node;
- if (nvme_identify_ns(ctrl, nsid, ids, &id))
- return;
-
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
- goto out_free_id;
+ return;
disk = blk_mq_alloc_disk(ctrl->tagset, ns);
if (IS_ERR(disk))
@@ -4002,7 +4204,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
ns->ctrl = ctrl;
kref_init(&ns->kref);
- if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
+ if (nvme_init_ns_head(ns, info))
goto out_cleanup_disk;
/*
@@ -4028,7 +4230,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
ns->head->instance);
}
- if (nvme_update_ns_info(ns, id))
+ if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
down_write(&ctrl->namespaces_rwsem);
@@ -4042,9 +4244,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (!nvme_ns_head_multipath(ns->head))
nvme_add_ns_cdev(ns);
- nvme_mpath_add_disk(ns, id);
+ nvme_mpath_add_disk(ns, info->anagrpid);
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
- kfree(id);
return;
@@ -4064,8 +4265,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
put_disk(disk);
out_free_ns:
kfree(ns);
- out_free_id:
- kfree(id);
}
static void nvme_ns_remove(struct nvme_ns *ns)
@@ -4123,29 +4322,21 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
}
}
-static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
- struct nvme_id_ns *id;
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (test_bit(NVME_NS_DEAD, &ns->flags))
goto out;
- ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
- if (ret)
- goto out;
-
ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
- if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+ if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
- goto out_free_id;
+ goto out;
}
- ret = nvme_update_ns_info(ns, id);
-
-out_free_id:
- kfree(id);
+ ret = nvme_update_ns_info(ns, info);
out:
/*
* Only remove the namespace if we got a fatal error back from the
@@ -4157,59 +4348,47 @@ out:
nvme_ns_remove(ns);
}
-static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
- struct nvme_ns_ids ids = { };
- struct nvme_id_ns_cs_indep *id;
+ struct nvme_ns_info info = { .nsid = nsid };
struct nvme_ns *ns;
- bool ready = true;
- if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+ if (nvme_identify_ns_descs(ctrl, &info))
return;
+ if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
+ dev_warn(ctrl->device,
+ "command set not reported for nsid: %d\n", nsid);
+ return;
+ }
+
/*
- * Check if the namespace is ready. If not ignore it, we will get an
- * AEN once it becomes ready and restart the scan.
+ * If available try to use the Command Set Idependent Identify Namespace
+ * data structure to find all the generic information that is needed to
+ * set up a namespace. If not fall back to the legacy version.
*/
- if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
- !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
- ready = id->nstat & NVME_NSTAT_NRDY;
- kfree(id);
+ if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
+ (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) {
+ if (nvme_ns_info_from_id_cs_indep(ctrl, &info))
+ return;
+ } else {
+ if (nvme_ns_info_from_identify(ctrl, &info))
+ return;
}
- if (!ready)
+ /*
+ * Ignore the namespace if it is not ready. We will get an AEN once it
+ * becomes ready and restart the scan.
+ */
+ if (!info.is_ready)
return;
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- nvme_validate_ns(ns, &ids);
+ nvme_validate_ns(ns, &info);
nvme_put_ns(ns);
- return;
- }
-
- switch (ids.csi) {
- case NVME_CSI_NVM:
- nvme_alloc_ns(ctrl, nsid, &ids);
- break;
- case NVME_CSI_ZNS:
- if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
- dev_warn(ctrl->device,
- "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
- nsid);
- break;
- }
- if (!nvme_multi_css(ctrl)) {
- dev_warn(ctrl->device,
- "command set not reported for nsid: %d\n",
- nsid);
- break;
- }
- nvme_alloc_ns(ctrl, nsid, &ids);
- break;
- default:
- dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
- ids.csi, nsid);
- break;
+ } else {
+ nvme_alloc_ns(ctrl, &info);
}
}
@@ -4265,7 +4444,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
if (!nsid) /* end of the list? */
goto out;
- nvme_validate_or_alloc_ns(ctrl, nsid);
+ nvme_scan_ns(ctrl, nsid);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
@@ -4288,7 +4467,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
kfree(id);
for (i = 1; i <= nn; i++)
- nvme_validate_or_alloc_ns(ctrl, i);
+ nvme_scan_ns(ctrl, i);
nvme_remove_invalid_namespaces(ctrl, nn);
}
@@ -4525,9 +4704,19 @@ static void nvme_fw_act_work(struct work_struct *work)
nvme_get_fw_slot_info(ctrl);
}
+static u32 nvme_aer_type(u32 result)
+{
+ return result & 0x7;
+}
+
+static u32 nvme_aer_subtype(u32 result)
+{
+ return (result & 0xff00) >> 8;
+}
+
static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
- u32 aer_notice_type = (result & 0xff00) >> 8;
+ u32 aer_notice_type = nvme_aer_subtype(result);
trace_nvme_async_event(ctrl, aer_notice_type);
@@ -4542,8 +4731,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
* recovery actions from interfering with the controller's
* firmware activation.
*/
- if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+ nvme_auth_stop(ctrl);
queue_work(nvme_wq, &ctrl->fw_act_work);
+ }
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
@@ -4560,11 +4751,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
}
}
+static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
+{
+ trace_nvme_async_event(ctrl, NVME_AER_ERROR);
+ dev_warn(ctrl->device, "resetting controller due to AER\n");
+ nvme_reset_ctrl(ctrl);
+}
+
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
volatile union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
- u32 aer_type = result & 0x07;
+ u32 aer_type = nvme_aer_type(result);
+ u32 aer_subtype = nvme_aer_subtype(result);
if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
return;
@@ -4574,6 +4773,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
nvme_handle_aen_notice(ctrl, result);
break;
case NVME_AER_ERROR:
+ /*
+ * For a persistent internal error, don't run async_event_work
+ * to submit a new AER. The controller reset will do it.
+ */
+ if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+ nvme_handle_aer_persistent_error(ctrl);
+ return;
+ }
+ fallthrough;
case NVME_AER_SMART:
case NVME_AER_CSS:
case NVME_AER_VS:
@@ -4590,6 +4798,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
+ nvme_auth_stop(ctrl);
nvme_stop_keep_alive(ctrl);
nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
@@ -4649,6 +4858,8 @@ static void nvme_free_ctrl(struct device *dev)
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
+ nvme_auth_stop(ctrl);
+ nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
if (subsys) {
@@ -4739,6 +4950,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
+ nvme_auth_init_ctrl(ctrl);
return 0;
out_free_name:
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ee79a6d639b4..5207a2348257 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -152,7 +152,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
@@ -198,7 +198,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
- ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0,
NVME_QID_ANY, 0, 0);
if (unlikely(ret))
dev_err(ctrl->device,
@@ -331,6 +331,10 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
dev_err(ctrl->device,
"Connect command failed: host path error\n");
break;
+ case NVME_SC_AUTH_REQUIRED:
+ dev_err(ctrl->device,
+ "Connect command failed: authentication required\n");
+ break;
default:
dev_err(ctrl->device,
"Connect command failed, error wo/DNR bit: %d\n",
@@ -365,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
union nvme_result res;
struct nvmf_connect_data *data;
int ret;
+ u32 result;
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -389,7 +394,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
- data, sizeof(*data), 0, NVME_QID_ANY, 1,
+ data, sizeof(*data), NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
@@ -397,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
goto out_free_data;
}
- ctrl->cntlid = le16_to_cpu(res.u16);
-
+ result = le32_to_cpu(res.u32);
+ ctrl->cntlid = result & 0xFFFF;
+ if ((result >> 16) & 0x3) {
+ /* Authentication required */
+ ret = nvme_auth_negotiate(ctrl, 0);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid 0: authentication setup failed\n");
+ ret = NVME_SC_AUTH_REQUIRED;
+ goto out_free_data;
+ }
+ ret = nvme_auth_wait(ctrl, 0);
+ if (ret)
+ dev_warn(ctrl->device,
+ "qid 0: authentication failed\n");
+ else
+ dev_info(ctrl->device,
+ "qid 0: authenticated\n");
+ }
out_free_data:
kfree(data);
return ret;
@@ -431,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
struct nvmf_connect_data *data;
union nvme_result res;
int ret;
+ u32 result;
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -450,12 +473,27 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
- data, sizeof(*data), 0, qid, 1,
+ data, sizeof(*data), qid, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
}
+ result = le32_to_cpu(res.u32);
+ if ((result >> 16) & 2) {
+ /* Authentication required */
+ ret = nvme_auth_negotiate(ctrl, qid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid %d: authentication setup failed\n", qid);
+ ret = NVME_SC_AUTH_REQUIRED;
+ } else {
+ ret = nvme_auth_wait(ctrl, qid);
+ if (ret)
+ dev_warn(ctrl->device,
+ "qid %u: authentication failed\n", qid);
+ }
+ }
kfree(data);
return ret;
}
@@ -548,6 +586,8 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" },
{ NVMF_OPT_DISCOVERY, "discovery" },
+ { NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" },
+ { NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" },
{ NVMF_OPT_ERR, NULL }
};
@@ -829,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
case NVMF_OPT_DISCOVERY:
opts->discovery_nqn = true;
break;
+ case NVMF_OPT_DHCHAP_SECRET:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+ pr_err("Invalid DH-CHAP secret %s\n", p);
+ ret = -EINVAL;
+ goto out;
+ }
+ kfree(opts->dhchap_secret);
+ opts->dhchap_secret = p;
+ break;
+ case NVMF_OPT_DHCHAP_CTRL_SECRET:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+ pr_err("Invalid DH-CHAP secret %s\n", p);
+ ret = -EINVAL;
+ goto out;
+ }
+ kfree(opts->dhchap_ctrl_secret);
+ opts->dhchap_ctrl_secret = p;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -947,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
kfree(opts->subsysnqn);
kfree(opts->host_traddr);
kfree(opts->host_iface);
+ kfree(opts->dhchap_secret);
+ kfree(opts->dhchap_ctrl_secret);
kfree(opts);
}
EXPORT_SYMBOL_GPL(nvmf_free_options);
@@ -956,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
- NVMF_OPT_FAIL_FAST_TMO)
+ NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
+ NVMF_OPT_DHCHAP_CTRL_SECRET)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf)
@@ -1192,7 +1263,14 @@ static void __exit nvmf_exit(void)
BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
+ BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
}
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 46d6e194ac2b..a6e22116e139 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -68,6 +68,8 @@ enum {
NVMF_OPT_FAIL_FAST_TMO = 1 << 20,
NVMF_OPT_HOST_IFACE = 1 << 21,
NVMF_OPT_DISCOVERY = 1 << 22,
+ NVMF_OPT_DHCHAP_SECRET = 1 << 23,
+ NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
};
/**
@@ -97,6 +99,9 @@ enum {
* @max_reconnects: maximum number of allowed reconnect attempts before removing
* the controller, (-1) means reconnect forever, zero means remove
* immediately;
+ * @dhchap_secret: DH-HMAC-CHAP secret
+ * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
+ * authentication
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
@@ -121,6 +126,8 @@ struct nvmf_ctrl_options {
unsigned int kato;
struct nvmf_host *host;
int max_reconnects;
+ char *dhchap_secret;
+ char *dhchap_ctrl_secret;
bool disable_sqflow;
bool hdr_digest;
bool data_digest;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f26640ccb955..6ef497c75a16 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -346,7 +346,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
* different queue via blk_steal_bios(), so we need to use the bio_split
* pool from the original queue to allocate the bvecs from.
*/
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
@@ -408,6 +408,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.open = nvme_ns_head_open,
.release = nvme_ns_head_release,
.ioctl = nvme_ns_head_ioctl,
+ .compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = nvme_getgeo,
.report_zones = nvme_ns_head_report_zones,
.pr_ops = &nvme_pr_ops,
@@ -800,16 +801,16 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
return -ENXIO; /* just break out of the loop */
}
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
struct nvme_ana_group_desc desc = {
- .grpid = id->anagrpid,
+ .grpid = anagrpid,
.state = 0,
};
mutex_lock(&ns->ctrl->ana_lock);
- ns->ana_grpid = le32_to_cpu(id->anagrpid);
+ ns->ana_grpid = le32_to_cpu(anagrpid);
nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
mutex_unlock(&ns->ctrl->ana_lock);
if (desc.state) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7e0a925bf3be..bdc0ff7ed9ab 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -140,7 +140,7 @@ enum nvme_quirks {
NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16),
/*
- * The controller requires the command_id value be be limited, so skip
+ * The controller requires the command_id value be limited, so skip
* encoding the generation sequence number.
*/
NVME_QUIRK_SKIP_CID_GEN = (1 << 17),
@@ -328,6 +328,15 @@ struct nvme_ctrl {
struct work_struct ana_work;
#endif
+#ifdef CONFIG_NVME_AUTH
+ struct work_struct dhchap_auth_work;
+ struct list_head dhchap_auth_list;
+ struct mutex dhchap_auth_mutex;
+ struct nvme_dhchap_key *host_key;
+ struct nvme_dhchap_key *ctrl_key;
+ u16 transaction;
+#endif
+
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@@ -781,7 +790,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- unsigned timeout, int qid, int at_head,
+ int qid, int at_head,
blk_mq_req_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
@@ -837,7 +846,7 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
@@ -879,8 +888,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
return 0;
}
-static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
- struct nvme_id_ns *id)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -992,6 +1000,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
+#ifdef CONFIG_NVME_AUTH
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_auth_stop(struct nvme_ctrl *ctrl);
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
+void nvme_auth_reset(struct nvme_ctrl *ctrl);
+void nvme_auth_free(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+ return -EPROTONOSUPPORT;
+}
+static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+ return NVME_SC_AUTH_REQUIRED;
+}
+static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+#endif
+
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
int nvme_execute_passthru_rq(struct request *rq);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7e7d4802ac6b..71a4f26ba476 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -670,7 +670,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
- iod->first_dma = dma_addr;
iod->npages = -1;
return BLK_STS_RESOURCE;
}
@@ -1435,8 +1434,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
dev_warn(nvmeq->dev->ctrl.device,
- "I/O %d QID %d timeout, aborting\n",
- req->tag, nvmeq->qid);
+ "I/O %d (%s) QID %d timeout, aborting\n",
+ req->tag,
+ nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
+ nvmeq->qid);
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
BLK_MQ_REQ_NOWAIT);
@@ -1765,37 +1766,35 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
}
}
-static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
{
- if (!dev->ctrl.admin_q) {
- dev->admin_tagset.ops = &nvme_mq_admin_ops;
- dev->admin_tagset.nr_hw_queues = 1;
+ struct blk_mq_tag_set *set = &dev->admin_tagset;
- dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
- dev->admin_tagset.numa_node = dev->ctrl.numa_node;
- dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
- dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
- dev->admin_tagset.driver_data = dev;
+ set->ops = &nvme_mq_admin_ops;
+ set->nr_hw_queues = 1;
- if (blk_mq_alloc_tag_set(&dev->admin_tagset))
- return -ENOMEM;
- dev->ctrl.admin_tagset = &dev->admin_tagset;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+ set->timeout = NVME_ADMIN_TIMEOUT;
+ set->numa_node = dev->ctrl.numa_node;
+ set->cmd_size = sizeof(struct nvme_iod);
+ set->flags = BLK_MQ_F_NO_SCHED;
+ set->driver_data = dev;
- dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
- if (IS_ERR(dev->ctrl.admin_q)) {
- blk_mq_free_tag_set(&dev->admin_tagset);
- dev->ctrl.admin_q = NULL;
- return -ENOMEM;
- }
- if (!blk_get_queue(dev->ctrl.admin_q)) {
- nvme_dev_remove_admin(dev);
- dev->ctrl.admin_q = NULL;
- return -ENODEV;
- }
- } else
- nvme_start_admin_queue(&dev->ctrl);
+ if (blk_mq_alloc_tag_set(set))
+ return -ENOMEM;
+ dev->ctrl.admin_tagset = set;
+ dev->ctrl.admin_q = blk_mq_init_queue(set);
+ if (IS_ERR(dev->ctrl.admin_q)) {
+ blk_mq_free_tag_set(set);
+ dev->ctrl.admin_q = NULL;
+ return -ENOMEM;
+ }
+ if (!blk_get_queue(dev->ctrl.admin_q)) {
+ nvme_dev_remove_admin(dev);
+ dev->ctrl.admin_q = NULL;
+ return -ENODEV;
+ }
return 0;
}
@@ -2534,47 +2533,45 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
-static void nvme_dev_add(struct nvme_dev *dev)
+static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
{
+ struct blk_mq_tag_set * set = &dev->tagset;
int ret;
- if (!dev->ctrl.tagset) {
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.nr_maps = 2; /* default + read */
- if (dev->io_queues[HCTX_TYPE_POLL])
- dev->tagset.nr_maps++;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev->ctrl.numa_node;
- dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
- BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_iod);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
+ set->ops = &nvme_mq_ops;
+ set->nr_hw_queues = dev->online_queues - 1;
+ set->nr_maps = 2; /* default + read */
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ set->nr_maps++;
+ set->timeout = NVME_IO_TIMEOUT;
+ set->numa_node = dev->ctrl.numa_node;
+ set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
+ set->cmd_size = sizeof(struct nvme_iod);
+ set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->driver_data = dev;
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- dev->tagset.reserved_tags = NVME_AQ_DEPTH;
-
- ret = blk_mq_alloc_tag_set(&dev->tagset);
- if (ret) {
- dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
- return;
- }
- dev->ctrl.tagset = &dev->tagset;
- } else {
- blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ /*
+ * Some Apple controllers requires tags to be unique
+ * across admin and IO queue, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ set->reserved_tags = NVME_AQ_DEPTH;
- /* Free previously allocated queues that are no longer usable */
- nvme_free_queues(dev, dev->online_queues);
+ ret = blk_mq_alloc_tag_set(set);
+ if (ret) {
+ dev_warn(dev->ctrl.device,
+ "IO queues tagset allocation failed %d\n", ret);
+ return;
}
+ dev->ctrl.tagset = set;
+}
- nvme_dbbuf_set(dev);
+static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
+{
+ blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ /* free previously allocated queues that are no longer usable */
+ nvme_free_queues(dev, dev->online_queues);
}
static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2725,10 +2722,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_pci_disable(dev);
nvme_reap_pending_cqes(dev);
- blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_wait_completed_request(&dev->tagset);
- blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
+ nvme_cancel_tagset(&dev->ctrl);
+ nvme_cancel_admin_tagset(&dev->ctrl);
/*
* The driver will not be starting up queues again if shutting down so
@@ -2842,9 +2837,13 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out_unlock;
- result = nvme_alloc_admin_tags(dev);
- if (result)
- goto out_unlock;
+ if (!dev->ctrl.admin_q) {
+ result = nvme_pci_alloc_admin_tag_set(dev);
+ if (result)
+ goto out_unlock;
+ } else {
+ nvme_start_admin_queue(&dev->ctrl);
+ }
/*
* Limit the max command size to prevent iod->sg allocations going
@@ -2923,7 +2922,11 @@ static void nvme_reset_work(struct work_struct *work)
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- nvme_dev_add(dev);
+ if (!dev->ctrl.tagset)
+ nvme_pci_alloc_tag_set(dev);
+ else
+ nvme_pci_update_nr_queues(dev);
+ nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4665aebd944d..3100643be299 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -29,7 +29,7 @@
#include "fabrics.h"
-#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */
+#define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_MAX_SEGMENTS 256
@@ -248,12 +248,9 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
int ret;
- ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
- msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
- if (ret < 0)
+ ret = wait_for_completion_interruptible(&queue->cm_done);
+ if (ret)
return ret;
- if (ret == 0)
- return -ETIMEDOUT;
WARN_ON_ONCE(queue->cm_error > 0);
return queue->cm_error;
}
@@ -612,7 +609,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue->cm_error = -ETIMEDOUT;
ret = rdma_resolve_addr(queue->cm_id, src_addr,
(struct sockaddr *)&ctrl->addr,
- NVME_RDMA_CONNECT_TIMEOUT_MS);
+ NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
"rdma_resolve_addr failed (%d).\n", ret);
@@ -790,50 +787,54 @@ out_free_queues:
return ret;
}
-static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
- bool admin)
+static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- struct blk_mq_tag_set *set;
+ struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
int ret;
- if (admin) {
- set = &ctrl->admin_tag_set;
- memset(set, 0, sizeof(*set));
- set->ops = &nvme_rdma_admin_mq_ops;
- set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->reserved_tags = NVMF_RESERVED_TAGS;
- set->numa_node = nctrl->numa_node;
- set->cmd_size = sizeof(struct nvme_rdma_request) +
- NVME_RDMA_DATA_SGL_SIZE;
- set->driver_data = ctrl;
- set->nr_hw_queues = 1;
- set->timeout = NVME_ADMIN_TIMEOUT;
- set->flags = BLK_MQ_F_NO_SCHED;
- } else {
- set = &ctrl->tag_set;
- memset(set, 0, sizeof(*set));
- set->ops = &nvme_rdma_mq_ops;
- set->queue_depth = nctrl->sqsize + 1;
- set->reserved_tags = NVMF_RESERVED_TAGS;
- set->numa_node = nctrl->numa_node;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
- set->cmd_size = sizeof(struct nvme_rdma_request) +
- NVME_RDMA_DATA_SGL_SIZE;
- if (nctrl->max_integrity_segments)
- set->cmd_size += sizeof(struct nvme_rdma_sgl) +
- NVME_RDMA_METADATA_SGL_SIZE;
- set->driver_data = ctrl;
- set->nr_hw_queues = nctrl->queue_count - 1;
- set->timeout = NVME_IO_TIMEOUT;
- set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
- }
-
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_rdma_admin_mq_ops;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+ set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->numa_node = nctrl->numa_node;
+ set->cmd_size = sizeof(struct nvme_rdma_request) +
+ NVME_RDMA_DATA_SGL_SIZE;
+ set->driver_data = ctrl;
+ set->nr_hw_queues = 1;
+ set->timeout = NVME_ADMIN_TIMEOUT;
+ set->flags = BLK_MQ_F_NO_SCHED;
ret = blk_mq_alloc_tag_set(set);
- if (ret)
- return ERR_PTR(ret);
+ if (!ret)
+ ctrl->ctrl.admin_tagset = set;
+ return ret;
+}
+
+static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct blk_mq_tag_set *set = &ctrl->tag_set;
+ int ret;
- return set;
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_rdma_mq_ops;
+ set->queue_depth = nctrl->sqsize + 1;
+ set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->numa_node = nctrl->numa_node;
+ set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->cmd_size = sizeof(struct nvme_rdma_request) +
+ NVME_RDMA_DATA_SGL_SIZE;
+ if (nctrl->max_integrity_segments)
+ set->cmd_size += sizeof(struct nvme_rdma_sgl) +
+ NVME_RDMA_METADATA_SGL_SIZE;
+ set->driver_data = ctrl;
+ set->nr_hw_queues = nctrl->queue_count - 1;
+ set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+ ret = blk_mq_alloc_tag_set(set);
+ if (!ret)
+ ctrl->ctrl.tagset = set;
+ return ret;
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -885,11 +886,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_queue;
if (new) {
- ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
- if (IS_ERR(ctrl->ctrl.admin_tagset)) {
- error = PTR_ERR(ctrl->ctrl.admin_tagset);
+ error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl);
+ if (error)
goto out_free_async_qe;
- }
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
@@ -972,11 +971,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
return ret;
if (new) {
- ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
- if (IS_ERR(ctrl->ctrl.tagset)) {
- ret = PTR_ERR(ctrl->ctrl.tagset);
+ ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
+ if (ret)
goto out_free_io_queues;
- }
ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
if (ret)
@@ -1205,6 +1202,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, err_work);
+ nvme_auth_stop(&ctrl->ctrl);
nvme_stop_keep_alive(&ctrl->ctrl);
flush_work(&ctrl->ctrl.async_event_work);
nvme_rdma_teardown_io_queues(ctrl, false);
@@ -1894,7 +1892,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
if (ctrl->opts->tos >= 0)
rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
- ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
+ ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b95ee85053e3..e82dcfcda29b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -209,9 +209,11 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
-static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
{
- return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+ if (nvme_is_fabrics(req->req.cmd))
+ return NVME_TCP_ADMIN_CCSZ;
+ return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
}
static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
@@ -229,7 +231,7 @@ static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
rq = blk_mq_rq_from_pdu(req);
return rq_data_dir(rq) == WRITE && req->data_len &&
- req->data_len <= nvme_tcp_inline_data_size(req->queue);
+ req->data_len <= nvme_tcp_inline_data_size(req);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -1685,45 +1687,49 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
return ret;
}
-static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
- bool admin)
+static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
- struct blk_mq_tag_set *set;
+ struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
int ret;
- if (admin) {
- set = &ctrl->admin_tag_set;
- memset(set, 0, sizeof(*set));
- set->ops = &nvme_tcp_admin_mq_ops;
- set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->reserved_tags = NVMF_RESERVED_TAGS;
- set->numa_node = nctrl->numa_node;
- set->flags = BLK_MQ_F_BLOCKING;
- set->cmd_size = sizeof(struct nvme_tcp_request);
- set->driver_data = ctrl;
- set->nr_hw_queues = 1;
- set->timeout = NVME_ADMIN_TIMEOUT;
- } else {
- set = &ctrl->tag_set;
- memset(set, 0, sizeof(*set));
- set->ops = &nvme_tcp_mq_ops;
- set->queue_depth = nctrl->sqsize + 1;
- set->reserved_tags = NVMF_RESERVED_TAGS;
- set->numa_node = nctrl->numa_node;
- set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
- set->cmd_size = sizeof(struct nvme_tcp_request);
- set->driver_data = ctrl;
- set->nr_hw_queues = nctrl->queue_count - 1;
- set->timeout = NVME_IO_TIMEOUT;
- set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
- }
-
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_admin_mq_ops;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+ set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->numa_node = nctrl->numa_node;
+ set->flags = BLK_MQ_F_BLOCKING;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = 1;
+ set->timeout = NVME_ADMIN_TIMEOUT;
ret = blk_mq_alloc_tag_set(set);
- if (ret)
- return ERR_PTR(ret);
+ if (!ret)
+ nctrl->admin_tagset = set;
+ return ret;
+}
- return set;
+static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct blk_mq_tag_set *set = &ctrl->tag_set;
+ int ret;
+
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_mq_ops;
+ set->queue_depth = nctrl->sqsize + 1;
+ set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->numa_node = nctrl->numa_node;
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = nctrl->queue_count - 1;
+ set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+ ret = blk_mq_alloc_tag_set(set);
+ if (!ret)
+ nctrl->tagset = set;
+ return ret;
}
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
@@ -1899,11 +1905,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
return ret;
if (new) {
- ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
- if (IS_ERR(ctrl->tagset)) {
- ret = PTR_ERR(ctrl->tagset);
+ ret = nvme_tcp_alloc_tag_set(ctrl);
+ if (ret)
goto out_free_io_queues;
- }
ret = nvme_ctrl_init_connect_q(ctrl);
if (ret)
@@ -1968,11 +1972,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
return error;
if (new) {
- ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
- if (IS_ERR(ctrl->admin_tagset)) {
- error = PTR_ERR(ctrl->admin_tagset);
+ error = nvme_tcp_alloc_admin_tag_set(ctrl);
+ if (error)
goto out_free_queue;
- }
ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->fabrics_q)) {
@@ -2173,6 +2175,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ nvme_auth_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
@@ -2371,7 +2374,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
if (!blk_rq_nr_phys_segments(rq))
nvme_tcp_set_sg_null(c);
else if (rq_data_dir(rq) == WRITE &&
- req->data_len <= nvme_tcp_inline_data_size(queue))
+ req->data_len <= nvme_tcp_inline_data_size(req))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
nvme_tcp_set_sg_host_data(c, req->data_len);
@@ -2406,7 +2409,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
nvme_tcp_init_iter(req, rq_data_dir(rq));
if (rq_data_dir(rq) == WRITE &&
- req->data_len <= nvme_tcp_inline_data_size(queue))
+ req->data_len <= nvme_tcp_inline_data_size(req))
req->pdu_len = req->data_len;
pdu->hdr.type = nvme_tcp_cmd;
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 2a89c5aa0790..1c36fcedea20 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
return ret;
}
+static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 spsp0 = spc[1];
+ u8 spsp1 = spc[2];
+ u8 secp = spc[3];
+ u32 tl = get_unaligned_le32(spc + 4);
+
+ trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
+ spsp0, spsp1, secp, tl);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 spsp0 = spc[1];
+ u8 spsp1 = spc[2];
+ u8 secp = spc[3];
+ u32 al = get_unaligned_le32(spc + 4);
+
+ trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
+ spsp0, spsp1, secp, al);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
return nvme_trace_fabrics_connect(p, spc);
case nvme_fabrics_type_property_get:
return nvme_trace_fabrics_property_get(p, spc);
+ case nvme_fabrics_type_auth_send:
+ return nvme_trace_fabrics_auth_send(p, spc);
+ case nvme_fabrics_type_auth_receive:
+ return nvme_trace_fabrics_auth_receive(p, spc);
default:
return nvme_trace_fabrics_common(p, spc);
}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 37c7f4c89f92..6f0eaf6a1528 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -98,7 +98,7 @@ TRACE_EVENT(nvme_complete_rq,
TP_fast_assign(
__entry->ctrl_id = nvme_req(req)->ctrl->instance;
__entry->qid = nvme_req_qid(req);
- __entry->cid = req->tag;
+ __entry->cid = nvme_req(req)->cmd->common.command_id;
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
__entry->retries = nvme_req(req)->retries;
__entry->flags = nvme_req(req)->flags;
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 973561c93888..79fc64035ee3 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -83,3 +83,18 @@ config NVME_TARGET_TCP
devices over TCP.
If unsure, say N.
+
+config NVME_TARGET_AUTH
+ bool "NVMe over Fabrics In-band Authentication support"
+ depends on NVME_TARGET
+ select NVME_COMMON
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_DH
+ select CRYPTO_DH_RFC7919_GROUPS
+ help
+ This enables support for NVMe over Fabrics In-band Authentication
+
+ If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 9837e580fa7e..c66820102493 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -13,6 +13,7 @@ nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o
+nvmet-$(CONFIG_NVME_TARGET_AUTH) += fabrics-cmd-auth.o auth.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 397daaf51f1b..fc8a957fad0a 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1017,7 +1017,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
u16 ret;
if (nvme_is_fabrics(cmd))
- return nvmet_parse_fabrics_cmd(req);
+ return nvmet_parse_fabrics_admin_cmd(req);
+ if (unlikely(!nvmet_check_auth_status(req)))
+ return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
return nvmet_parse_discovery_cmd(req);
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
new file mode 100644
index 000000000000..cf690df34775
--- /dev/null
+++ b/drivers/nvme/target/auth.c
@@ -0,0 +1,525 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics DH-HMAC-CHAP authentication.
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/ctype.h>
+#include <linux/random.h>
+#include <linux/nvme-auth.h>
+#include <asm/unaligned.h>
+
+#include "nvmet.h"
+
+int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
+ bool set_ctrl)
+{
+ unsigned char key_hash;
+ char *dhchap_secret;
+
+ if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
+ return -EINVAL;
+ if (key_hash > 3) {
+ pr_warn("Invalid DH-HMAC-CHAP hash id %d\n",
+ key_hash);
+ return -EINVAL;
+ }
+ if (key_hash > 0) {
+ /* Validate selected hash algorithm */
+ const char *hmac = nvme_auth_hmac_name(key_hash);
+
+ if (!crypto_has_shash(hmac, 0, 0)) {
+ pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
+ return -ENOTSUPP;
+ }
+ }
+ dhchap_secret = kstrdup(secret, GFP_KERNEL);
+ if (!dhchap_secret)
+ return -ENOMEM;
+ if (set_ctrl) {
+ host->dhchap_ctrl_secret = strim(dhchap_secret);
+ host->dhchap_ctrl_key_hash = key_hash;
+ } else {
+ host->dhchap_secret = strim(dhchap_secret);
+ host->dhchap_key_hash = key_hash;
+ }
+ return 0;
+}
+
+int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
+{
+ const char *dhgroup_kpp;
+ int ret = 0;
+
+ pr_debug("%s: ctrl %d selecting dhgroup %d\n",
+ __func__, ctrl->cntlid, dhgroup_id);
+
+ if (ctrl->dh_tfm) {
+ if (ctrl->dh_gid == dhgroup_id) {
+ pr_debug("%s: ctrl %d reuse existing DH group %d\n",
+ __func__, ctrl->cntlid, dhgroup_id);
+ return 0;
+ }
+ crypto_free_kpp(ctrl->dh_tfm);
+ ctrl->dh_tfm = NULL;
+ ctrl->dh_gid = 0;
+ }
+
+ if (dhgroup_id == NVME_AUTH_DHGROUP_NULL)
+ return 0;
+
+ dhgroup_kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+ if (!dhgroup_kpp) {
+ pr_debug("%s: ctrl %d invalid DH group %d\n",
+ __func__, ctrl->cntlid, dhgroup_id);
+ return -EINVAL;
+ }
+ ctrl->dh_tfm = crypto_alloc_kpp(dhgroup_kpp, 0, 0);
+ if (IS_ERR(ctrl->dh_tfm)) {
+ pr_debug("%s: ctrl %d failed to setup DH group %d, err %ld\n",
+ __func__, ctrl->cntlid, dhgroup_id,
+ PTR_ERR(ctrl->dh_tfm));
+ ret = PTR_ERR(ctrl->dh_tfm);
+ ctrl->dh_tfm = NULL;
+ ctrl->dh_gid = 0;
+ } else {
+ ctrl->dh_gid = dhgroup_id;
+ pr_debug("%s: ctrl %d setup DH group %d\n",
+ __func__, ctrl->cntlid, ctrl->dh_gid);
+ ret = nvme_auth_gen_privkey(ctrl->dh_tfm, ctrl->dh_gid);
+ if (ret < 0) {
+ pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
+ __func__, ctrl->cntlid, ret);
+ kfree_sensitive(ctrl->dh_key);
+ return ret;
+ }
+ ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);
+ kfree_sensitive(ctrl->dh_key);
+ ctrl->dh_key = kzalloc(ctrl->dh_keysize, GFP_KERNEL);
+ if (!ctrl->dh_key) {
+ pr_warn("ctrl %d failed to allocate public key\n",
+ ctrl->cntlid);
+ return -ENOMEM;
+ }
+ ret = nvme_auth_gen_pubkey(ctrl->dh_tfm, ctrl->dh_key,
+ ctrl->dh_keysize);
+ if (ret < 0) {
+ pr_warn("ctrl %d failed to generate public key\n",
+ ctrl->cntlid);
+ kfree(ctrl->dh_key);
+ ctrl->dh_key = NULL;
+ }
+ }
+
+ return ret;
+}
+
+int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+{
+ int ret = 0;
+ struct nvmet_host_link *p;
+ struct nvmet_host *host = NULL;
+ const char *hash_name;
+
+ down_read(&nvmet_config_sem);
+ if (nvmet_is_disc_subsys(ctrl->subsys))
+ goto out_unlock;
+
+ if (ctrl->subsys->allow_any_host)
+ goto out_unlock;
+
+ list_for_each_entry(p, &ctrl->subsys->hosts, entry) {
+ pr_debug("check %s\n", nvmet_host_name(p->host));
+ if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn))
+ continue;
+ host = p->host;
+ break;
+ }
+ if (!host) {
+ pr_debug("host %s not found\n", ctrl->hostnqn);
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id);
+ if (ret < 0)
+ pr_warn("Failed to setup DH group");
+
+ if (!host->dhchap_secret) {
+ pr_debug("No authentication provided\n");
+ goto out_unlock;
+ }
+
+ if (host->dhchap_hash_id == ctrl->shash_id) {
+ pr_debug("Re-use existing hash ID %d\n",
+ ctrl->shash_id);
+ } else {
+ hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+ if (!hash_name) {
+ pr_warn("Hash ID %d invalid\n", host->dhchap_hash_id);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ ctrl->shash_id = host->dhchap_hash_id;
+ }
+
+ /* Skip the 'DHHC-1:XX:' prefix */
+ nvme_auth_free_key(ctrl->host_key);
+ ctrl->host_key = nvme_auth_extract_key(host->dhchap_secret + 10,
+ host->dhchap_key_hash);
+ if (IS_ERR(ctrl->host_key)) {
+ ret = PTR_ERR(ctrl->host_key);
+ ctrl->host_key = NULL;
+ goto out_free_hash;
+ }
+ pr_debug("%s: using hash %s key %*ph\n", __func__,
+ ctrl->host_key->hash > 0 ?
+ nvme_auth_hmac_name(ctrl->host_key->hash) : "none",
+ (int)ctrl->host_key->len, ctrl->host_key->key);
+
+ nvme_auth_free_key(ctrl->ctrl_key);
+ if (!host->dhchap_ctrl_secret) {
+ ctrl->ctrl_key = NULL;
+ goto out_unlock;
+ }
+
+ ctrl->ctrl_key = nvme_auth_extract_key(host->dhchap_ctrl_secret + 10,
+ host->dhchap_ctrl_key_hash);
+ if (IS_ERR(ctrl->ctrl_key)) {
+ ret = PTR_ERR(ctrl->ctrl_key);
+ ctrl->ctrl_key = NULL;
+ }
+ pr_debug("%s: using ctrl hash %s key %*ph\n", __func__,
+ ctrl->ctrl_key->hash > 0 ?
+ nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none",
+ (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key);
+
+out_free_hash:
+ if (ret) {
+ if (ctrl->host_key) {
+ nvme_auth_free_key(ctrl->host_key);
+ ctrl->host_key = NULL;
+ }
+ ctrl->shash_id = 0;
+ }
+out_unlock:
+ up_read(&nvmet_config_sem);
+
+ return ret;
+}
+
+void nvmet_auth_sq_free(struct nvmet_sq *sq)
+{
+ cancel_delayed_work(&sq->auth_expired_work);
+ kfree(sq->dhchap_c1);
+ sq->dhchap_c1 = NULL;
+ kfree(sq->dhchap_c2);
+ sq->dhchap_c2 = NULL;
+ kfree(sq->dhchap_skey);
+ sq->dhchap_skey = NULL;
+}
+
+void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
+{
+ ctrl->shash_id = 0;
+
+ if (ctrl->dh_tfm) {
+ crypto_free_kpp(ctrl->dh_tfm);
+ ctrl->dh_tfm = NULL;
+ ctrl->dh_gid = 0;
+ }
+ kfree_sensitive(ctrl->dh_key);
+ ctrl->dh_key = NULL;
+
+ if (ctrl->host_key) {
+ nvme_auth_free_key(ctrl->host_key);
+ ctrl->host_key = NULL;
+ }
+ if (ctrl->ctrl_key) {
+ nvme_auth_free_key(ctrl->ctrl_key);
+ ctrl->ctrl_key = NULL;
+ }
+}
+
+bool nvmet_check_auth_status(struct nvmet_req *req)
+{
+ if (req->sq->ctrl->host_key &&
+ !req->sq->authenticated)
+ return false;
+ return true;
+}
+
+int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
+ unsigned int shash_len)
+{
+ struct crypto_shash *shash_tfm;
+ struct shash_desc *shash;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ const char *hash_name;
+ u8 *challenge = req->sq->dhchap_c1, *host_response;
+ u8 buf[4];
+ int ret;
+
+ hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+ if (!hash_name) {
+ pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+ return -EINVAL;
+ }
+
+ shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+ if (IS_ERR(shash_tfm)) {
+ pr_err("failed to allocate shash %s\n", hash_name);
+ return PTR_ERR(shash_tfm);
+ }
+
+ if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+ pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+ __func__, shash_len,
+ crypto_shash_digestsize(shash_tfm));
+ ret = -EINVAL;
+ goto out_free_tfm;
+ }
+
+ host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
+ if (IS_ERR(host_response)) {
+ ret = PTR_ERR(host_response);
+ goto out_free_tfm;
+ }
+
+ ret = crypto_shash_setkey(shash_tfm, host_response,
+ ctrl->host_key->len);
+ if (ret)
+ goto out_free_response;
+
+ if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+ challenge = kmalloc(shash_len, GFP_KERNEL);
+ if (!challenge) {
+ ret = -ENOMEM;
+ goto out_free_response;
+ }
+ ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+ req->sq->dhchap_skey,
+ req->sq->dhchap_skey_len,
+ req->sq->dhchap_c1,
+ challenge, shash_len);
+ if (ret)
+ goto out_free_response;
+ }
+
+ pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
+ ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
+ req->sq->dhchap_tid);
+
+ shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+ GFP_KERNEL);
+ if (!shash) {
+ ret = -ENOMEM;
+ goto out_free_response;
+ }
+ shash->tfm = shash_tfm;
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, challenge, shash_len);
+ if (ret)
+ goto out;
+ put_unaligned_le32(req->sq->dhchap_s1, buf);
+ ret = crypto_shash_update(shash, buf, 4);
+ if (ret)
+ goto out;
+ put_unaligned_le16(req->sq->dhchap_tid, buf);
+ ret = crypto_shash_update(shash, buf, 2);
+ if (ret)
+ goto out;
+ memset(buf, 0, 4);
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, "HostHost", 8);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->subsysnqn,
+ strlen(ctrl->subsysnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_final(shash, response);
+out:
+ if (challenge != req->sq->dhchap_c1)
+ kfree(challenge);
+ kfree(shash);
+out_free_response:
+ kfree_sensitive(host_response);
+out_free_tfm:
+ crypto_free_shash(shash_tfm);
+ return 0;
+}
+
+int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
+ unsigned int shash_len)
+{
+ struct crypto_shash *shash_tfm;
+ struct shash_desc *shash;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ const char *hash_name;
+ u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
+ u8 buf[4];
+ int ret;
+
+ hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+ if (!hash_name) {
+ pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+ return -EINVAL;
+ }
+
+ shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+ if (IS_ERR(shash_tfm)) {
+ pr_err("failed to allocate shash %s\n", hash_name);
+ return PTR_ERR(shash_tfm);
+ }
+
+ if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+ pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+ __func__, shash_len,
+ crypto_shash_digestsize(shash_tfm));
+ ret = -EINVAL;
+ goto out_free_tfm;
+ }
+
+ ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+ ctrl->subsysnqn);
+ if (IS_ERR(ctrl_response)) {
+ ret = PTR_ERR(ctrl_response);
+ goto out_free_tfm;
+ }
+
+ ret = crypto_shash_setkey(shash_tfm, ctrl_response,
+ ctrl->ctrl_key->len);
+ if (ret)
+ goto out_free_response;
+
+ if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+ challenge = kmalloc(shash_len, GFP_KERNEL);
+ if (!challenge) {
+ ret = -ENOMEM;
+ goto out_free_response;
+ }
+ ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+ req->sq->dhchap_skey,
+ req->sq->dhchap_skey_len,
+ req->sq->dhchap_c2,
+ challenge, shash_len);
+ if (ret)
+ goto out_free_response;
+ }
+
+ shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+ GFP_KERNEL);
+ if (!shash) {
+ ret = -ENOMEM;
+ goto out_free_response;
+ }
+ shash->tfm = shash_tfm;
+
+ ret = crypto_shash_init(shash);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, challenge, shash_len);
+ if (ret)
+ goto out;
+ put_unaligned_le32(req->sq->dhchap_s2, buf);
+ ret = crypto_shash_update(shash, buf, 4);
+ if (ret)
+ goto out;
+ put_unaligned_le16(req->sq->dhchap_tid, buf);
+ ret = crypto_shash_update(shash, buf, 2);
+ if (ret)
+ goto out;
+ memset(buf, 0, 4);
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, "Controller", 10);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->subsysnqn,
+ strlen(ctrl->subsysnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, buf, 1);
+ if (ret)
+ goto out;
+ ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+ if (ret)
+ goto out;
+ ret = crypto_shash_final(shash, response);
+out:
+ if (challenge != req->sq->dhchap_c2)
+ kfree(challenge);
+ kfree(shash);
+out_free_response:
+ kfree_sensitive(ctrl_response);
+out_free_tfm:
+ crypto_free_shash(shash_tfm);
+ return 0;
+}
+
+int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
+ u8 *buf, int buf_size)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ int ret = 0;
+
+ if (!ctrl->dh_key) {
+ pr_warn("ctrl %d no DH public key!\n", ctrl->cntlid);
+ return -ENOKEY;
+ }
+ if (buf_size != ctrl->dh_keysize) {
+ pr_warn("ctrl %d DH public key size mismatch, need %zu is %d\n",
+ ctrl->cntlid, ctrl->dh_keysize, buf_size);
+ ret = -EINVAL;
+ } else {
+ memcpy(buf, ctrl->dh_key, buf_size);
+ pr_debug("%s: ctrl %d public key %*ph\n", __func__,
+ ctrl->cntlid, (int)buf_size, buf);
+ }
+
+ return ret;
+}
+
+int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
+ u8 *pkey, int pkey_size)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ int ret;
+
+ req->sq->dhchap_skey_len = ctrl->dh_keysize;
+ req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL);
+ if (!req->sq->dhchap_skey)
+ return -ENOMEM;
+ ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm,
+ pkey, pkey_size,
+ req->sq->dhchap_skey,
+ req->sq->dhchap_skey_len);
+ if (ret)
+ pr_debug("failed to compute shared secret, err %d\n", ret);
+ else
+ pr_debug("%s: shared secret %*ph\n", __func__,
+ (int)req->sq->dhchap_skey_len,
+ req->sq->dhchap_skey);
+
+ return ret;
+}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ff77c3d2354f..2bcd60758919 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -11,6 +11,11 @@
#include <linux/ctype.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
+#ifdef CONFIG_NVME_TARGET_AUTH
+#include <linux/nvme-auth.h>
+#endif
+#include <crypto/hash.h>
+#include <crypto/kpp.h>
#include "nvmet.h"
@@ -1680,10 +1685,133 @@ static const struct config_item_type nvmet_ports_type = {
static struct config_group nvmet_subsystems_group;
static struct config_group nvmet_ports_group;
+#ifdef CONFIG_NVME_TARGET_AUTH
+static ssize_t nvmet_host_dhchap_key_show(struct config_item *item,
+ char *page)
+{
+ u8 *dhchap_secret = to_host(item)->dhchap_secret;
+
+ if (!dhchap_secret)
+ return sprintf(page, "\n");
+ return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_key_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_host *host = to_host(item);
+ int ret;
+
+ ret = nvmet_auth_set_key(host, page, false);
+ /*
+ * Re-authentication is a soft state, so keep the
+ * current authentication valid until the host
+ * requests re-authentication.
+ */
+ return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_key);
+
+static ssize_t nvmet_host_dhchap_ctrl_key_show(struct config_item *item,
+ char *page)
+{
+ u8 *dhchap_secret = to_host(item)->dhchap_ctrl_secret;
+
+ if (!dhchap_secret)
+ return sprintf(page, "\n");
+ return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_ctrl_key_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_host *host = to_host(item);
+ int ret;
+
+ ret = nvmet_auth_set_key(host, page, true);
+ /*
+ * Re-authentication is a soft state, so keep the
+ * current authentication valid until the host
+ * requests re-authentication.
+ */
+ return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_ctrl_key);
+
+static ssize_t nvmet_host_dhchap_hash_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_host *host = to_host(item);
+ const char *hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+
+ return sprintf(page, "%s\n", hash_name ? hash_name : "none");
+}
+
+static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_host *host = to_host(item);
+ u8 hmac_id;
+
+ hmac_id = nvme_auth_hmac_id(page);
+ if (hmac_id == NVME_AUTH_HASH_INVALID)
+ return -EINVAL;
+ if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0))
+ return -ENOTSUPP;
+ host->dhchap_hash_id = hmac_id;
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_hash);
+
+static ssize_t nvmet_host_dhchap_dhgroup_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_host *host = to_host(item);
+ const char *dhgroup = nvme_auth_dhgroup_name(host->dhchap_dhgroup_id);
+
+ return sprintf(page, "%s\n", dhgroup ? dhgroup : "none");
+}
+
+static ssize_t nvmet_host_dhchap_dhgroup_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_host *host = to_host(item);
+ int dhgroup_id;
+
+ dhgroup_id = nvme_auth_dhgroup_id(page);
+ if (dhgroup_id == NVME_AUTH_DHGROUP_INVALID)
+ return -EINVAL;
+ if (dhgroup_id != NVME_AUTH_DHGROUP_NULL) {
+ const char *kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+
+ if (!crypto_has_kpp(kpp, 0, 0))
+ return -EINVAL;
+ }
+ host->dhchap_dhgroup_id = dhgroup_id;
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_dhgroup);
+
+static struct configfs_attribute *nvmet_host_attrs[] = {
+ &nvmet_host_attr_dhchap_key,
+ &nvmet_host_attr_dhchap_ctrl_key,
+ &nvmet_host_attr_dhchap_hash,
+ &nvmet_host_attr_dhchap_dhgroup,
+ NULL,
+};
+#endif /* CONFIG_NVME_TARGET_AUTH */
+
static void nvmet_host_release(struct config_item *item)
{
struct nvmet_host *host = to_host(item);
+#ifdef CONFIG_NVME_TARGET_AUTH
+ kfree(host->dhchap_secret);
+#endif
kfree(host);
}
@@ -1693,6 +1821,9 @@ static struct configfs_item_operations nvmet_host_item_ops = {
static const struct config_item_type nvmet_host_type = {
.ct_item_ops = &nvmet_host_item_ops,
+#ifdef CONFIG_NVME_TARGET_AUTH
+ .ct_attrs = nvmet_host_attrs,
+#endif
.ct_owner = THIS_MODULE,
};
@@ -1705,6 +1836,11 @@ static struct config_group *nvmet_hosts_make_group(struct config_group *group,
if (!host)
return ERR_PTR(-ENOMEM);
+#ifdef CONFIG_NVME_TARGET_AUTH
+ /* Default to SHA256 */
+ host->dhchap_hash_id = NVME_AUTH_HASH_SHA256;
+#endif
+
config_group_init_type_name(&host->group, name, &nvmet_host_type);
return &host->group;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index c27660a660d9..a1345790005f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -795,6 +795,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
wait_for_completion(&sq->confirm_done);
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
+ nvmet_auth_sq_free(sq);
if (ctrl) {
/*
@@ -865,8 +866,15 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
+ struct nvme_command *cmd = req->cmd;
u16 ret;
+ if (nvme_is_fabrics(cmd))
+ return nvmet_parse_fabrics_io_cmd(req);
+
+ if (unlikely(!nvmet_check_auth_status(req)))
+ return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
+
ret = nvmet_check_ctrl_status(req);
if (unlikely(ret))
return ret;
@@ -1271,6 +1279,11 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req)
req->cmd->common.opcode, req->sq->qid);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
+
+ if (unlikely(!nvmet_check_auth_status(req))) {
+ pr_warn("qid %d not authenticated\n", req->sq->qid);
+ return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
+ }
return 0;
}
@@ -1467,6 +1480,8 @@ static void nvmet_ctrl_free(struct kref *ref)
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fatal_err_work);
+ nvmet_destroy_auth(ctrl);
+
ida_free(&cntlid_ida, ctrl->cntlid);
nvmet_async_events_free(ctrl);
diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c
new file mode 100644
index 000000000000..c851814d6cb0
--- /dev/null
+++ b/drivers/nvme/target/fabrics-cmd-auth.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics DH-HMAC-CHAP authentication command handling.
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/nvme-auth.h>
+#include <crypto/hash.h>
+#include <crypto/kpp.h>
+#include "nvmet.h"
+
+static void nvmet_auth_expired_work(struct work_struct *work)
+{
+ struct nvmet_sq *sq = container_of(to_delayed_work(work),
+ struct nvmet_sq, auth_expired_work);
+
+ pr_debug("%s: ctrl %d qid %d transaction %u expired, resetting\n",
+ __func__, sq->ctrl->cntlid, sq->qid, sq->dhchap_tid);
+ sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+ sq->dhchap_tid = -1;
+}
+
+void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
+{
+ u32 result = le32_to_cpu(req->cqe->result.u32);
+
+ /* Initialize in-band authentication */
+ INIT_DELAYED_WORK(&req->sq->auth_expired_work,
+ nvmet_auth_expired_work);
+ req->sq->authenticated = false;
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+ result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16;
+ req->cqe->result.u32 = cpu_to_le32(result);
+}
+
+static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmf_auth_dhchap_negotiate_data *data = d;
+ int i, hash_id = 0, fallback_hash_id = 0, dhgid, fallback_dhgid;
+
+ pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ data->sc_c, data->napd, data->auth_protocol[0].dhchap.authid,
+ data->auth_protocol[0].dhchap.halen,
+ data->auth_protocol[0].dhchap.dhlen);
+ req->sq->dhchap_tid = le16_to_cpu(data->t_id);
+ if (data->sc_c)
+ return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+
+ if (data->napd != 1)
+ return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+
+ if (data->auth_protocol[0].dhchap.authid !=
+ NVME_AUTH_DHCHAP_AUTH_ID)
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+
+ for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) {
+ u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i];
+
+ if (!fallback_hash_id &&
+ crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0))
+ fallback_hash_id = host_hmac_id;
+ if (ctrl->shash_id != host_hmac_id)
+ continue;
+ hash_id = ctrl->shash_id;
+ break;
+ }
+ if (hash_id == 0) {
+ if (fallback_hash_id == 0) {
+ pr_debug("%s: ctrl %d qid %d: no usable hash found\n",
+ __func__, ctrl->cntlid, req->sq->qid);
+ return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+ }
+ pr_debug("%s: ctrl %d qid %d: no usable hash found, falling back to %s\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ nvme_auth_hmac_name(fallback_hash_id));
+ ctrl->shash_id = fallback_hash_id;
+ }
+
+ dhgid = -1;
+ fallback_dhgid = -1;
+ for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) {
+ int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30];
+
+ if (tmp_dhgid != ctrl->dh_gid) {
+ dhgid = tmp_dhgid;
+ break;
+ }
+ if (fallback_dhgid < 0) {
+ const char *kpp = nvme_auth_dhgroup_kpp(tmp_dhgid);
+
+ if (crypto_has_kpp(kpp, 0, 0))
+ fallback_dhgid = tmp_dhgid;
+ }
+ }
+ if (dhgid < 0) {
+ if (fallback_dhgid < 0) {
+ pr_debug("%s: ctrl %d qid %d: no usable DH group found\n",
+ __func__, ctrl->cntlid, req->sq->qid);
+ return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+ }
+ pr_debug("%s: ctrl %d qid %d: configured DH group %s not found\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ nvme_auth_dhgroup_name(fallback_dhgid));
+ ctrl->dh_gid = fallback_dhgid;
+ }
+ pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid);
+ return 0;
+}
+
+static u16 nvmet_auth_reply(struct nvmet_req *req, void *d)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmf_auth_dhchap_reply_data *data = d;
+ u16 dhvlen = le16_to_cpu(data->dhvlen);
+ u8 *response;
+
+ pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ data->hl, data->cvalid, dhvlen);
+
+ if (dhvlen) {
+ if (!ctrl->dh_tfm)
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ if (nvmet_auth_ctrl_sesskey(req, data->rval + 2 * data->hl,
+ dhvlen) < 0)
+ return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+ }
+
+ response = kmalloc(data->hl, GFP_KERNEL);
+ if (!response)
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+
+ if (!ctrl->host_key) {
+ pr_warn("ctrl %d qid %d no host key\n",
+ ctrl->cntlid, req->sq->qid);
+ kfree(response);
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ }
+ if (nvmet_auth_host_hash(req, response, data->hl) < 0) {
+ pr_debug("ctrl %d qid %d host hash failed\n",
+ ctrl->cntlid, req->sq->qid);
+ kfree(response);
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ }
+
+ if (memcmp(data->rval, response, data->hl)) {
+ pr_info("ctrl %d qid %d host response mismatch\n",
+ ctrl->cntlid, req->sq->qid);
+ kfree(response);
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ }
+ kfree(response);
+ pr_debug("%s: ctrl %d qid %d host authenticated\n",
+ __func__, ctrl->cntlid, req->sq->qid);
+ if (data->cvalid) {
+ req->sq->dhchap_c2 = kmalloc(data->hl, GFP_KERNEL);
+ if (!req->sq->dhchap_c2)
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ memcpy(req->sq->dhchap_c2, data->rval + data->hl, data->hl);
+
+ pr_debug("%s: ctrl %d qid %d challenge %*ph\n",
+ __func__, ctrl->cntlid, req->sq->qid, data->hl,
+ req->sq->dhchap_c2);
+ req->sq->dhchap_s2 = le32_to_cpu(data->seqnum);
+ } else {
+ req->sq->authenticated = true;
+ req->sq->dhchap_c2 = NULL;
+ }
+
+ return 0;
+}
+
+static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d)
+{
+ struct nvmf_auth_dhchap_failure_data *data = d;
+
+ return data->rescode_exp;
+}
+
+void nvmet_execute_auth_send(struct nvmet_req *req)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmf_auth_dhchap_success2_data *data;
+ void *d;
+ u32 tl;
+ u16 status = 0;
+
+ if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_send_command, secp);
+ goto done;
+ }
+ if (req->cmd->auth_send.spsp0 != 0x01) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_send_command, spsp0);
+ goto done;
+ }
+ if (req->cmd->auth_send.spsp1 != 0x01) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_send_command, spsp1);
+ goto done;
+ }
+ tl = le32_to_cpu(req->cmd->auth_send.tl);
+ if (!tl) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_send_command, tl);
+ goto done;
+ }
+ if (!nvmet_check_transfer_len(req, tl)) {
+ pr_debug("%s: transfer length mismatch (%u)\n", __func__, tl);
+ return;
+ }
+
+ d = kmalloc(tl, GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto done;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, tl);
+ if (status) {
+ kfree(d);
+ goto done;
+ }
+
+ data = d;
+ pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__,
+ ctrl->cntlid, req->sq->qid, data->auth_type, data->auth_id,
+ req->sq->dhchap_step);
+ if (data->auth_type != NVME_AUTH_COMMON_MESSAGES &&
+ data->auth_type != NVME_AUTH_DHCHAP_MESSAGES)
+ goto done_failure1;
+ if (data->auth_type == NVME_AUTH_COMMON_MESSAGES) {
+ if (data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE) {
+ /* Restart negotiation */
+ pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__,
+ ctrl->cntlid, req->sq->qid);
+ if (!req->sq->qid) {
+ if (nvmet_setup_auth(ctrl) < 0) {
+ status = NVME_SC_INTERNAL;
+ pr_err("ctrl %d qid 0 failed to setup"
+ "re-authentication",
+ ctrl->cntlid);
+ goto done_failure1;
+ }
+ }
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+ } else if (data->auth_id != req->sq->dhchap_step)
+ goto done_failure1;
+ /* Validate negotiation parameters */
+ status = nvmet_auth_negotiate(req, d);
+ if (status == 0)
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
+ else {
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+ req->sq->dhchap_status = status;
+ status = 0;
+ }
+ goto done_kfree;
+ }
+ if (data->auth_id != req->sq->dhchap_step) {
+ pr_debug("%s: ctrl %d qid %d step mismatch (%d != %d)\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ data->auth_id, req->sq->dhchap_step);
+ goto done_failure1;
+ }
+ if (le16_to_cpu(data->t_id) != req->sq->dhchap_tid) {
+ pr_debug("%s: ctrl %d qid %d invalid transaction %d (expected %d)\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ le16_to_cpu(data->t_id),
+ req->sq->dhchap_tid);
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+ req->sq->dhchap_status =
+ NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+ goto done_kfree;
+ }
+
+ switch (data->auth_id) {
+ case NVME_AUTH_DHCHAP_MESSAGE_REPLY:
+ status = nvmet_auth_reply(req, d);
+ if (status == 0)
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
+ else {
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+ req->sq->dhchap_status = status;
+ status = 0;
+ }
+ goto done_kfree;
+ break;
+ case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2:
+ req->sq->authenticated = true;
+ pr_debug("%s: ctrl %d qid %d ctrl authenticated\n",
+ __func__, ctrl->cntlid, req->sq->qid);
+ goto done_kfree;
+ break;
+ case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2:
+ status = nvmet_auth_failure2(req, d);
+ if (status) {
+ pr_warn("ctrl %d qid %d: authentication failed (%d)\n",
+ ctrl->cntlid, req->sq->qid, status);
+ req->sq->dhchap_status = status;
+ req->sq->authenticated = false;
+ status = 0;
+ }
+ goto done_kfree;
+ break;
+ default:
+ req->sq->dhchap_status =
+ NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+ req->sq->dhchap_step =
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+ req->sq->authenticated = false;
+ goto done_kfree;
+ break;
+ }
+done_failure1:
+ req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+
+done_kfree:
+ kfree(d);
+done:
+ pr_debug("%s: ctrl %d qid %d dhchap status %x step %x\n", __func__,
+ ctrl->cntlid, req->sq->qid,
+ req->sq->dhchap_status, req->sq->dhchap_step);
+ if (status)
+ pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n",
+ __func__, ctrl->cntlid, req->sq->qid,
+ status, req->error_loc);
+ req->cqe->result.u64 = 0;
+ nvmet_req_complete(req, status);
+ if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 &&
+ req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
+ unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120;
+
+ mod_delayed_work(system_wq, &req->sq->auth_expired_work,
+ auth_expire_secs * HZ);
+ return;
+ }
+ /* Final states, clear up variables */
+ nvmet_auth_sq_free(req->sq);
+ if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2)
+ nvmet_ctrl_fatal_error(ctrl);
+}
+
+static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al)
+{
+ struct nvmf_auth_dhchap_challenge_data *data = d;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ int ret = 0;
+ int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
+ int data_size = sizeof(*d) + hash_len;
+
+ if (ctrl->dh_tfm)
+ data_size += ctrl->dh_keysize;
+ if (al < data_size) {
+ pr_debug("%s: buffer too small (al %d need %d)\n", __func__,
+ al, data_size);
+ return -EINVAL;
+ }
+ memset(data, 0, data_size);
+ req->sq->dhchap_s1 = nvme_auth_get_seqnum();
+ data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
+ data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+ data->hashid = ctrl->shash_id;
+ data->hl = hash_len;
+ data->seqnum = cpu_to_le32(req->sq->dhchap_s1);
+ req->sq->dhchap_c1 = kmalloc(data->hl, GFP_KERNEL);
+ if (!req->sq->dhchap_c1)
+ return -ENOMEM;
+ get_random_bytes(req->sq->dhchap_c1, data->hl);
+ memcpy(data->cval, req->sq->dhchap_c1, data->hl);
+ if (ctrl->dh_tfm) {
+ data->dhgid = ctrl->dh_gid;
+ data->dhvlen = cpu_to_le16(ctrl->dh_keysize);
+ ret = nvmet_auth_ctrl_exponential(req, data->cval + data->hl,
+ ctrl->dh_keysize);
+ }
+ pr_debug("%s: ctrl %d qid %d seq %d transaction %d hl %d dhvlen %zu\n",
+ __func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
+ req->sq->dhchap_tid, data->hl, ctrl->dh_keysize);
+ return ret;
+}
+
+static int nvmet_auth_success1(struct nvmet_req *req, void *d, int al)
+{
+ struct nvmf_auth_dhchap_success1_data *data = d;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
+
+ WARN_ON(al < sizeof(*data));
+ memset(data, 0, sizeof(*data));
+ data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
+ data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+ data->hl = hash_len;
+ if (req->sq->dhchap_c2) {
+ if (!ctrl->ctrl_key) {
+ pr_warn("ctrl %d qid %d no ctrl key\n",
+ ctrl->cntlid, req->sq->qid);
+ return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ }
+ if (nvmet_auth_ctrl_hash(req, data->rval, data->hl))
+ return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+ data->rvalid = 1;
+ pr_debug("ctrl %d qid %d response %*ph\n",
+ ctrl->cntlid, req->sq->qid, data->hl, data->rval);
+ }
+ return 0;
+}
+
+static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al)
+{
+ struct nvmf_auth_dhchap_failure_data *data = d;
+
+ WARN_ON(al < sizeof(*data));
+ data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+ data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+ data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+ data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+ data->rescode_exp = req->sq->dhchap_status;
+}
+
+void nvmet_execute_auth_receive(struct nvmet_req *req)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ void *d;
+ u32 al;
+ u16 status = 0;
+
+ if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_receive_command, secp);
+ goto done;
+ }
+ if (req->cmd->auth_receive.spsp0 != 0x01) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_receive_command, spsp0);
+ goto done;
+ }
+ if (req->cmd->auth_receive.spsp1 != 0x01) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_receive_command, spsp1);
+ goto done;
+ }
+ al = le32_to_cpu(req->cmd->auth_receive.al);
+ if (!al) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ req->error_loc =
+ offsetof(struct nvmf_auth_receive_command, al);
+ goto done;
+ }
+ if (!nvmet_check_transfer_len(req, al)) {
+ pr_debug("%s: transfer length mismatch (%u)\n", __func__, al);
+ return;
+ }
+
+ d = kmalloc(al, GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto done;
+ }
+ pr_debug("%s: ctrl %d qid %d step %x\n", __func__,
+ ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
+ switch (req->sq->dhchap_step) {
+ case NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE:
+ if (nvmet_auth_challenge(req, d, al) < 0) {
+ pr_warn("ctrl %d qid %d: challenge error (%d)\n",
+ ctrl->cntlid, req->sq->qid, status);
+ status = NVME_SC_INTERNAL;
+ break;
+ }
+ if (status) {
+ req->sq->dhchap_status = status;
+ nvmet_auth_failure1(req, d, al);
+ pr_warn("ctrl %d qid %d: challenge status (%x)\n",
+ ctrl->cntlid, req->sq->qid,
+ req->sq->dhchap_status);
+ status = 0;
+ break;
+ }
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+ break;
+ case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1:
+ status = nvmet_auth_success1(req, d, al);
+ if (status) {
+ req->sq->dhchap_status = status;
+ req->sq->authenticated = false;
+ nvmet_auth_failure1(req, d, al);
+ pr_warn("ctrl %d qid %d: success1 status (%x)\n",
+ ctrl->cntlid, req->sq->qid,
+ req->sq->dhchap_status);
+ break;
+ }
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+ break;
+ case NVME_AUTH_DHCHAP_MESSAGE_FAILURE1:
+ req->sq->authenticated = false;
+ nvmet_auth_failure1(req, d, al);
+ pr_warn("ctrl %d qid %d failure1 (%x)\n",
+ ctrl->cntlid, req->sq->qid, req->sq->dhchap_status);
+ break;
+ default:
+ pr_warn("ctrl %d qid %d unhandled step (%d)\n",
+ ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
+ req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+ req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+ nvmet_auth_failure1(req, d, al);
+ status = 0;
+ break;
+ }
+
+ status = nvmet_copy_to_sgl(req, 0, d, al);
+ kfree(d);
+done:
+ req->cqe->result.u64 = 0;
+ nvmet_req_complete(req, status);
+ if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2)
+ nvmet_auth_sq_free(req->sq);
+ else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+ nvmet_auth_sq_free(req->sq);
+ nvmet_ctrl_fatal_error(ctrl);
+ }
+}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 70fb587e9413..f91a56180d3d 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -82,7 +82,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -93,6 +93,37 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
case nvme_fabrics_type_property_get:
req->execute = nvmet_execute_prop_get;
break;
+#ifdef CONFIG_NVME_TARGET_AUTH
+ case nvme_fabrics_type_auth_send:
+ req->execute = nvmet_execute_auth_send;
+ break;
+ case nvme_fabrics_type_auth_receive:
+ req->execute = nvmet_execute_auth_receive;
+ break;
+#endif
+ default:
+ pr_debug("received unknown capsule type 0x%x\n",
+ cmd->fabrics.fctype);
+ req->error_loc = offsetof(struct nvmf_common_command, fctype);
+ return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+
+ return 0;
+}
+
+u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req)
+{
+ struct nvme_command *cmd = req->cmd;
+
+ switch (cmd->fabrics.fctype) {
+#ifdef CONFIG_NVME_TARGET_AUTH
+ case nvme_fabrics_type_auth_send:
+ req->execute = nvmet_execute_auth_send;
+ break;
+ case nvme_fabrics_type_auth_receive:
+ req->execute = nvmet_execute_auth_receive;
+ break;
+#endif
default:
pr_debug("received unknown capsule type 0x%x\n",
cmd->fabrics.fctype);
@@ -173,6 +204,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
struct nvmf_connect_data *d;
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
+ int ret;
if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data)))
return;
@@ -215,18 +247,32 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
uuid_copy(&ctrl->hostid, &d->hostid);
+ ret = nvmet_setup_auth(ctrl);
+ if (ret < 0) {
+ pr_err("Failed to setup authentication, error %d\n", ret);
+ nvmet_ctrl_put(ctrl);
+ if (ret == -EPERM)
+ status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR);
+ else
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
status = nvmet_install_queue(ctrl, req);
if (status) {
nvmet_ctrl_put(ctrl);
goto out;
}
- pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n",
+ pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n",
nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
- ctrl->pi_support ? " T10-PI is enabled" : "");
+ ctrl->pi_support ? " T10-PI is enabled" : "",
+ nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
+ if (nvmet_has_auth(ctrl))
+ nvmet_init_auth(ctrl, req);
out:
kfree(d);
complete:
@@ -286,6 +332,9 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
+ req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
+ if (nvmet_has_auth(ctrl))
+ nvmet_init_auth(ctrl, req);
out:
kfree(d);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0f5c77e22a0a..9750a7fca268 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -424,9 +424,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
+ nvme_cancel_tagset(&ctrl->ctrl);
nvme_loop_destroy_io_queues(ctrl);
}
@@ -434,9 +432,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 2b3e5719f24e..6ffeeb0a1c49 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -108,6 +108,19 @@ struct nvmet_sq {
u16 size;
u32 sqhd;
bool sqhd_disabled;
+#ifdef CONFIG_NVME_TARGET_AUTH
+ struct delayed_work auth_expired_work;
+ bool authenticated;
+ u16 dhchap_tid;
+ u16 dhchap_status;
+ int dhchap_step;
+ u8 *dhchap_c1;
+ u8 *dhchap_c2;
+ u32 dhchap_s1;
+ u32 dhchap_s2;
+ u8 *dhchap_skey;
+ int dhchap_skey_len;
+#endif
struct completion free_done;
struct completion confirm_done;
};
@@ -209,6 +222,15 @@ struct nvmet_ctrl {
u64 err_counter;
struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
bool pi_support;
+#ifdef CONFIG_NVME_TARGET_AUTH
+ struct nvme_dhchap_key *host_key;
+ struct nvme_dhchap_key *ctrl_key;
+ u8 shash_id;
+ struct crypto_kpp *dh_tfm;
+ u8 dh_gid;
+ u8 *dh_key;
+ size_t dh_keysize;
+#endif
};
struct nvmet_subsys {
@@ -271,6 +293,12 @@ static inline struct nvmet_subsys *namespaces_to_subsys(
struct nvmet_host {
struct config_group group;
+ u8 *dhchap_secret;
+ u8 *dhchap_ctrl_secret;
+ u8 dhchap_key_hash;
+ u8 dhchap_ctrl_key_hash;
+ u8 dhchap_hash_id;
+ u8 dhchap_dhgroup_id;
};
static inline struct nvmet_host *to_host(struct config_item *item)
@@ -420,7 +448,8 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
-u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
@@ -668,4 +697,48 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio)
bio_put(bio);
}
+#ifdef CONFIG_NVME_TARGET_AUTH
+void nvmet_execute_auth_send(struct nvmet_req *req);
+void nvmet_execute_auth_receive(struct nvmet_req *req);
+int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
+ bool set_ctrl);
+int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
+int nvmet_setup_auth(struct nvmet_ctrl *ctrl);
+void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req);
+void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
+void nvmet_auth_sq_free(struct nvmet_sq *sq);
+int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id);
+bool nvmet_check_auth_status(struct nvmet_req *req);
+int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
+ unsigned int hash_len);
+int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
+ unsigned int hash_len);
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+{
+ return ctrl->host_key != NULL;
+}
+int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
+ u8 *buf, int buf_size);
+int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
+ u8 *buf, int buf_size);
+#else
+static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+{
+ return 0;
+}
+static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl,
+ struct nvmet_req *req) {};
+static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {};
+static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {};
+static inline bool nvmet_check_auth_status(struct nvmet_req *req)
+{
+ return true;
+}
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+{
+ return false;
+}
+static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
+#endif
+
#endif /* _NVMET_H */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 0a9542599ad1..dc3b4dc8fe08 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1839,7 +1839,8 @@ static int __init nvmet_tcp_init(void)
{
int ret;
- nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
+ nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!nvmet_tcp_wq)
return -ENOMEM;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 4df8bf6505fc..ea82821599f6 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1725,7 +1725,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
dasd_put_device(device);
}
- /* check for for attention message */
+ /* check for attention message */
if (scsw_dstat(&irb->scsw) & DEV_STAT_ATTENTION) {
device = dasd_device_from_cdev_locked(cdev);
if (!IS_ERR(device)) {
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index e9edf3b6ed7c..94ee59864971 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -639,6 +639,7 @@ static void dasd_diag_setup_blk_queue(struct dasd_block *block)
/* With page sized segments each segment can be translated into one idaw/tidaw */
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+ blk_queue_dma_alignment(q, PAGE_SIZE - 1);
}
static int dasd_diag_pe_handler(struct dasd_device *device,
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 836838f7d686..3cc93e2e4e15 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -6626,6 +6626,7 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
/* With page sized segments each segment can be translated into one idaw/tidaw */
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+ blk_queue_dma_alignment(q, PAGE_SIZE - 1);
}
static struct ccw_driver dasd_eckd_driver = {
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4d8d1759775a..5187705bd0f3 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -863,7 +863,7 @@ dcssblk_submit_bio(struct bio *bio)
unsigned long source_addr;
unsigned long bytes_done;
- blk_queue_split(&bio);
+ bio = bio_split_to_limits(bio);
bytes_done = 0;
dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index f140e4643949..f5841992dc9b 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -718,6 +718,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
u32 mask);
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask);
+
static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
{
return &tfm->base;
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index cccceadc164b..24d01e9877c1 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -104,6 +104,8 @@ struct kpp_alg {
*/
struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask);
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask);
+
static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm)
{
return &tfm->base;
diff --git a/include/linux/base64.h b/include/linux/base64.h
new file mode 100644
index 000000000000..660d4cb1ef31
--- /dev/null
+++ b/include/linux/base64.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64 encoding, lifted from fs/crypto/fname.c.
+ */
+
+#ifndef _LINUX_BASE64_H
+#define _LINUX_BASE64_H
+
+#include <linux/types.h>
+
+#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
+int base64_encode(const u8 *src, int len, char *dst);
+int base64_decode(const char *src, int len, u8 *dst);
+
+#endif /* _LINUX_BASE64_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dccdf1551c62..84b13fdd34a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -140,6 +140,8 @@ struct gendisk {
struct request_queue *queue;
void *private_data;
+ struct bio_set bio_split;
+
int flags;
unsigned long state;
#define GD_NEED_PART_SCAN 0
@@ -531,7 +533,6 @@ struct request_queue {
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
- struct bio_set bio_split;
struct dentry *debugfs_dir;
struct dentry *sched_debugfs_dir;
@@ -864,9 +865,9 @@ void blk_request_module(dev_t devt);
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
void submit_bio_noacct(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio *bio);
extern int blk_lld_busy(struct request_queue *q);
-extern void blk_queue_split(struct bio **);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
new file mode 100644
index 000000000000..dcb8030062dd
--- /dev/null
+++ b/include/linux/nvme-auth.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_AUTH_H
+#define _NVME_AUTH_H
+
+#include <crypto/kpp.h>
+
+struct nvme_dhchap_key {
+ u8 *key;
+ size_t len;
+ u8 hash;
+};
+
+u32 nvme_auth_get_seqnum(void);
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
+
+const char *nvme_auth_hmac_name(u8 hmac_id);
+const char *nvme_auth_digest_name(u8 hmac_id);
+size_t nvme_auth_hmac_hash_len(u8 hmac_id);
+u8 nvme_auth_hmac_id(const char *hmac_name);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+ u8 key_hash);
+void nvme_auth_free_key(struct nvme_dhchap_key *key);
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+ u8 *challenge, u8 *aug, size_t hlen);
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+ u8 *host_key, size_t host_key_len);
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+ u8 *ctrl_key, size_t ctrl_key_len,
+ u8 *sess_key, size_t sess_key_len);
+
+#endif /* _NVME_AUTH_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 07cfc922f8e4..ae53d74f3696 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
#define NVMF_TRSVCID_SIZE 32
#define NVMF_TRADDR_SIZE 256
#define NVMF_TSAS_SIZE 256
+#define NVMF_AUTH_HASH_LEN 64
#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
@@ -712,6 +713,10 @@ enum {
};
enum {
+ NVME_AER_ERROR_PERSIST_INT_ERR = 0x03,
+};
+
+enum {
NVME_AER_NOTICE_NS_CHANGED = 0x00,
NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
NVME_AER_NOTICE_ANA = 0x03,
@@ -1369,6 +1374,8 @@ enum nvmf_capsule_command {
nvme_fabrics_type_property_set = 0x00,
nvme_fabrics_type_connect = 0x01,
nvme_fabrics_type_property_get = 0x04,
+ nvme_fabrics_type_auth_send = 0x05,
+ nvme_fabrics_type_auth_receive = 0x06,
};
#define nvme_fabrics_type_name(type) { type, #type }
@@ -1376,7 +1383,9 @@ enum nvmf_capsule_command {
__print_symbolic(type, \
nvme_fabrics_type_name(nvme_fabrics_type_property_set), \
nvme_fabrics_type_name(nvme_fabrics_type_connect), \
- nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+ nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
+ nvme_fabrics_type_name(nvme_fabrics_type_auth_send), \
+ nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
/*
* If not fabrics command, fctype will be ignored.
@@ -1472,6 +1481,11 @@ struct nvmf_connect_command {
__u8 resv4[12];
};
+enum {
+ NVME_CONNECT_AUTHREQ_ASCR = (1 << 2),
+ NVME_CONNECT_AUTHREQ_ATR = (1 << 1),
+};
+
struct nvmf_connect_data {
uuid_t hostid;
__le16 cntlid;
@@ -1506,6 +1520,200 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
+struct nvmf_auth_common_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[19];
+ union nvme_data_ptr dptr;
+ __u8 resv3;
+ __u8 spsp0;
+ __u8 spsp1;
+ __u8 secp;
+ __le32 al_tl;
+ __u8 resv4[16];
+};
+
+struct nvmf_auth_send_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[19];
+ union nvme_data_ptr dptr;
+ __u8 resv3;
+ __u8 spsp0;
+ __u8 spsp1;
+ __u8 secp;
+ __le32 tl;
+ __u8 resv4[16];
+};
+
+struct nvmf_auth_receive_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[19];
+ union nvme_data_ptr dptr;
+ __u8 resv3;
+ __u8 spsp0;
+ __u8 spsp1;
+ __u8 secp;
+ __le32 al;
+ __u8 resv4[16];
+};
+
+/* Value for secp */
+enum {
+ NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER = 0xe9,
+};
+
+/* Defined value for auth_type */
+enum {
+ NVME_AUTH_COMMON_MESSAGES = 0x00,
+ NVME_AUTH_DHCHAP_MESSAGES = 0x01,
+};
+
+/* Defined messages for auth_id */
+enum {
+ NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE = 0x00,
+ NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE = 0x01,
+ NVME_AUTH_DHCHAP_MESSAGE_REPLY = 0x02,
+ NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1 = 0x03,
+ NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 = 0x04,
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE2 = 0xf0,
+ NVME_AUTH_DHCHAP_MESSAGE_FAILURE1 = 0xf1,
+};
+
+struct nvmf_auth_dhchap_protocol_descriptor {
+ __u8 authid;
+ __u8 rsvd;
+ __u8 halen;
+ __u8 dhlen;
+ __u8 idlist[60];
+};
+
+enum {
+ NVME_AUTH_DHCHAP_AUTH_ID = 0x01,
+};
+
+/* Defined hash functions for DH-HMAC-CHAP authentication */
+enum {
+ NVME_AUTH_HASH_SHA256 = 0x01,
+ NVME_AUTH_HASH_SHA384 = 0x02,
+ NVME_AUTH_HASH_SHA512 = 0x03,
+ NVME_AUTH_HASH_INVALID = 0xff,
+};
+
+/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */
+enum {
+ NVME_AUTH_DHGROUP_NULL = 0x00,
+ NVME_AUTH_DHGROUP_2048 = 0x01,
+ NVME_AUTH_DHGROUP_3072 = 0x02,
+ NVME_AUTH_DHGROUP_4096 = 0x03,
+ NVME_AUTH_DHGROUP_6144 = 0x04,
+ NVME_AUTH_DHGROUP_8192 = 0x05,
+ NVME_AUTH_DHGROUP_INVALID = 0xff,
+};
+
+union nvmf_auth_protocol {
+ struct nvmf_auth_dhchap_protocol_descriptor dhchap;
+};
+
+struct nvmf_auth_dhchap_negotiate_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __le16 rsvd;
+ __le16 t_id;
+ __u8 sc_c;
+ __u8 napd;
+ union nvmf_auth_protocol auth_protocol[];
+};
+
+struct nvmf_auth_dhchap_challenge_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __u16 rsvd1;
+ __le16 t_id;
+ __u8 hl;
+ __u8 rsvd2;
+ __u8 hashid;
+ __u8 dhgid;
+ __le16 dhvlen;
+ __le32 seqnum;
+ /* 'hl' bytes of challenge value */
+ __u8 cval[];
+ /* followed by 'dhvlen' bytes of DH value */
+};
+
+struct nvmf_auth_dhchap_reply_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __le16 rsvd1;
+ __le16 t_id;
+ __u8 hl;
+ __u8 rsvd2;
+ __u8 cvalid;
+ __u8 rsvd3;
+ __le16 dhvlen;
+ __le32 seqnum;
+ /* 'hl' bytes of response data */
+ __u8 rval[];
+ /* followed by 'hl' bytes of Challenge value */
+ /* followed by 'dhvlen' bytes of DH value */
+};
+
+enum {
+ NVME_AUTH_DHCHAP_RESPONSE_VALID = (1 << 0),
+};
+
+struct nvmf_auth_dhchap_success1_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __le16 rsvd1;
+ __le16 t_id;
+ __u8 hl;
+ __u8 rsvd2;
+ __u8 rvalid;
+ __u8 rsvd3[7];
+ /* 'hl' bytes of response value if 'rvalid' is set */
+ __u8 rval[];
+};
+
+struct nvmf_auth_dhchap_success2_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __le16 rsvd1;
+ __le16 t_id;
+ __u8 rsvd2[10];
+};
+
+struct nvmf_auth_dhchap_failure_data {
+ __u8 auth_type;
+ __u8 auth_id;
+ __le16 rsvd1;
+ __le16 t_id;
+ __u8 rescode;
+ __u8 rescode_exp;
+};
+
+enum {
+ NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED = 0x01,
+};
+
+enum {
+ NVME_AUTH_DHCHAP_FAILURE_FAILED = 0x01,
+ NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE = 0x02,
+ NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH = 0x03,
+ NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE = 0x04,
+ NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE = 0x05,
+ NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD = 0x06,
+ NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE = 0x07,
+};
+
+
struct nvme_dbbuf {
__u8 opcode;
__u8 flags;
@@ -1549,6 +1757,9 @@ struct nvme_command {
struct nvmf_connect_command connect;
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
+ struct nvmf_auth_common_command auth_common;
+ struct nvmf_auth_send_command auth_send;
+ struct nvmf_auth_receive_command auth_receive;
struct nvme_dbbuf dbbuf;
struct nvme_directive_cmd directive;
};
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ca33092354ab..677edaab2b66 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -15,6 +15,8 @@
#define UBLK_CMD_DEL_DEV 0x05
#define UBLK_CMD_START_DEV 0x06
#define UBLK_CMD_STOP_DEV 0x07
+#define UBLK_CMD_SET_PARAMS 0x08
+#define UBLK_CMD_GET_PARAMS 0x09
/*
* IO commands, issued by ublk server, and handled by ublk driver.
@@ -28,12 +30,21 @@
* this IO request, request's handling result is committed to ublk
* driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
* handled before completing io request.
+ *
+ * NEED_GET_DATA: only used for write requests to set io addr and copy data
+ * When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
+ * command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
+ *
+ * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
+ * while starting a ublk device.
*/
#define UBLK_IO_FETCH_REQ 0x20
#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
+#define UBLK_IO_NEED_GET_DATA 0x22
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
+#define UBLK_IO_RES_NEED_GET_DATA 1
#define UBLK_IO_RES_ABORT (-ENODEV)
#define UBLKSRV_CMD_BUF_OFFSET 0
@@ -54,6 +65,15 @@
*/
#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1)
+/*
+ * User should issue io cmd again for write requests to
+ * set io buffer address and copy data from bio vectors
+ * to the userspace io buffer.
+ *
+ * In this mode, task_work is not used.
+ */
+#define UBLK_F_NEED_GET_DATA (1UL << 2)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -78,22 +98,23 @@ struct ublksrv_ctrl_cmd {
struct ublksrv_ctrl_dev_info {
__u16 nr_hw_queues;
__u16 queue_depth;
- __u16 block_size;
__u16 state;
+ __u16 pad0;
- __u32 rq_max_blocks;
+ __u32 max_io_buf_bytes;
__u32 dev_id;
- __u64 dev_blocks;
-
__s32 ublksrv_pid;
- __s32 reserved0;
+ __u32 pad1;
+
__u64 flags;
- __u64 flags_reserved;
/* For ublksrv internal use, invisible to ublk driver */
__u64 ublksrv_flags;
- __u64 reserved1[9];
+
+ __u64 reserved0;
+ __u64 reserved1;
+ __u64 reserved2;
};
#define UBLK_IO_OP_READ 0
@@ -158,4 +179,49 @@ struct ublksrv_io_cmd {
__u64 addr;
};
+struct ublk_param_basic {
+#define UBLK_ATTR_READ_ONLY (1 << 0)
+#define UBLK_ATTR_ROTATIONAL (1 << 1)
+#define UBLK_ATTR_VOLATILE_CACHE (1 << 2)
+#define UBLK_ATTR_FUA (1 << 3)
+ __u32 attrs;
+ __u8 logical_bs_shift;
+ __u8 physical_bs_shift;
+ __u8 io_opt_shift;
+ __u8 io_min_shift;
+
+ __u32 max_sectors;
+ __u32 chunk_sectors;
+
+ __u64 dev_sectors;
+ __u64 virt_boundary_mask;
+};
+
+struct ublk_param_discard {
+ __u32 discard_alignment;
+
+ __u32 discard_granularity;
+ __u32 max_discard_sectors;
+
+ __u32 max_write_zeroes_sectors;
+ __u16 max_discard_segments;
+ __u16 reserved0;
+};
+
+struct ublk_params {
+ /*
+ * Total length of parameters, userspace has to set 'len' for both
+ * SET_PARAMS and GET_PARAMS command, and driver may update len
+ * if two sides use different version of 'ublk_params', same with
+ * 'types' fields.
+ */
+ __u32 len;
+#define UBLK_PARAM_TYPE_BASIC (1 << 0)
+#define UBLK_PARAM_TYPE_DISCARD (1 << 1)
+ __u32 types; /* types of parameter included */
+
+ struct ublk_param_basic basic;
+ struct ublk_param_discard discard;
+};
+
#endif
diff --git a/lib/Makefile b/lib/Makefile
index 67482f5ec0e8..441795bf5d33 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
list_sort.o uuid.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o rhashtable.o \
+ percpu-refcount.o rhashtable.o base64.o \
once.o refcount.o usercopy.o errseq.o bucket_locks.o \
generic-radix-tree.o
obj-$(CONFIG_STRING_SELFTEST) += test_string.o
diff --git a/lib/base64.c b/lib/base64.c
new file mode 100644
index 000000000000..b736a7a431c5
--- /dev/null
+++ b/lib/base64.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64.c - RFC4648-compliant base64 encoding
+ *
+ * Copyright (c) 2020 Hannes Reinecke, SUSE
+ *
+ * Based on the base64url routines from fs/crypto/fname.c
+ * (which are using the URL-safe base64 encoding),
+ * modified to use the standard coding table from RFC4648 section 4.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/base64.h>
+
+static const char base64_table[65] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/**
+ * base64_encode() - base64-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64-encoded string. Not NUL-terminated.
+ *
+ * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
+ * by RFC 4648, including the '='-padding.
+ *
+ * Return: the length of the resulting base64-encoded string in bytes.
+ */
+int base64_encode(const u8 *src, int srclen, char *dst)
+{
+ u32 ac = 0;
+ int bits = 0;
+ int i;
+ char *cp = dst;
+
+ for (i = 0; i < srclen; i++) {
+ ac = (ac << 8) | src[i];
+ bits += 8;
+ do {
+ bits -= 6;
+ *cp++ = base64_table[(ac >> bits) & 0x3f];
+ } while (bits >= 6);
+ }
+ if (bits) {
+ *cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+ bits -= 6;
+ }
+ while (bits < 0) {
+ *cp++ = '=';
+ bits += 2;
+ }
+ return cp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_encode);
+
+/**
+ * base64_decode() - base64-decode a string
+ * @src: the string to decode. Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
+ * specified by RFC 4648, including the '='-padding.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ * or -1 if the string isn't a valid base64 string.
+ */
+int base64_decode(const char *src, int srclen, u8 *dst)
+{
+ u32 ac = 0;
+ int bits = 0;
+ int i;
+ u8 *bp = dst;
+
+ for (i = 0; i < srclen; i++) {
+ const char *p = strchr(base64_table, src[i]);
+
+ if (src[i] == '=') {
+ ac = (ac << 6);
+ bits += 6;
+ if (bits >= 8)
+ bits -= 8;
+ continue;
+ }
+ if (p == NULL || src[i] == 0)
+ return -1;
+ ac = (ac << 6) | (p - base64_table);
+ bits += 6;
+ if (bits >= 8) {
+ bits -= 8;
+ *bp++ = (u8)(ac >> bits);
+ }
+ }
+ if (ac & ((1 << bits) - 1))
+ return -1;
+ return bp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_decode);