aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/block/inline-encryption.rst451
-rw-r--r--Makefile3
-rw-r--r--arch/m68k/emu/nfblock.c3
-rw-r--r--arch/mips/rb532/prom.c1
-rw-r--r--arch/mips/sibyte/common/cfe.c1
-rw-r--r--arch/mips/sibyte/swarm/setup.c1
-rw-r--r--arch/openrisc/mm/init.c1
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c1
-rw-r--r--arch/um/drivers/ubd_kern.c1
-rw-r--r--arch/xtensa/platforms/iss/simdisk.c3
-rw-r--r--block/Kconfig28
-rw-r--r--block/Kconfig.iosched4
-rw-r--r--block/Makefile6
-rw-r--r--block/bdev.c18
-rw-r--r--block/bfq-cgroup.c14
-rw-r--r--block/bfq-iosched.c6
-rw-r--r--block/bio-integrity.c4
-rw-r--r--block/bio.c171
-rw-r--r--block/blk-cgroup.c27
-rw-r--r--block/blk-core.c404
-rw-r--r--block/blk-crypto-fallback.c119
-rw-r--r--block/blk-crypto-internal.h2
-rw-r--r--block/blk-crypto-profile.c565
-rw-r--r--block/blk-crypto.c29
-rw-r--r--block/blk-exec.c10
-rw-r--r--block/blk-flush.c12
-rw-r--r--block/blk-ia-ranges.c348
-rw-r--r--block/blk-integrity.c6
-rw-r--r--block/blk-iocost.c12
-rw-r--r--block/blk-iolatency.c1
-rw-r--r--block/blk-merge.c127
-rw-r--r--block/blk-mq-debugfs.c133
-rw-r--r--block/blk-mq-sched.c129
-rw-r--r--block/blk-mq-sched.h49
-rw-r--r--block/blk-mq-tag.c163
-rw-r--r--block/blk-mq-tag.h38
-rw-r--r--block/blk-mq.c1034
-rw-r--r--block/blk-mq.h79
-rw-r--r--block/blk-rq-qos.h5
-rw-r--r--block/blk-sysfs.c50
-rw-r--r--block/blk-throttle.c163
-rw-r--r--block/blk-throttle.h182
-rw-r--r--block/blk-wbt.c3
-rw-r--r--block/blk.h131
-rw-r--r--block/bounce.c1
-rw-r--r--block/elevator.c4
-rw-r--r--block/elevator.h (renamed from include/linux/elevator.h)21
-rw-r--r--block/fops.c282
-rw-r--r--block/genhd.c35
-rw-r--r--block/holder.c1
-rw-r--r--block/ioctl.c19
-rw-r--r--block/keyslot-manager.c578
-rw-r--r--block/kyber-iosched.c6
-rw-r--r--block/mq-deadline.c224
-rw-r--r--block/partitions/Kconfig4
-rw-r--r--block/partitions/core.c5
-rw-r--r--block/t10-pi.c2
-rw-r--r--drivers/block/amiflop.c2
-rw-r--r--drivers/block/ataflop.c1
-rw-r--r--drivers/block/brd.c12
-rw-r--r--drivers/block/drbd/drbd_int.h2
-rw-r--r--drivers/block/drbd/drbd_req.c3
-rw-r--r--drivers/block/floppy.c1
-rw-r--r--drivers/block/loop.c32
-rw-r--r--drivers/block/n64cart.c12
-rw-r--r--drivers/block/nbd.c15
-rw-r--r--drivers/block/null_blk/main.c3
-rw-r--r--drivers/block/pktcdvd.c7
-rw-r--r--drivers/block/ps3vram.c6
-rw-r--r--drivers/block/rbd.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt.c2
-rw-r--r--drivers/block/rnbd/rnbd-proto.h2
-rw-r--r--drivers/block/rsxx/dev.c7
-rw-r--r--drivers/block/swim.c1
-rw-r--r--drivers/block/virtio_blk.c12
-rw-r--r--drivers/block/xen-blkfront.c1
-rw-r--r--drivers/block/zram/zram_drv.c10
-rw-r--r--drivers/gpu/drm/i915/i915_utils.h1
-rw-r--r--drivers/md/bcache/request.c13
-rw-r--r--drivers/md/bcache/request.h4
-rw-r--r--drivers/md/dm-bio-record.h1
-rw-r--r--drivers/md/dm-core.h4
-rw-r--r--drivers/md/dm-crypt.c1
-rw-r--r--drivers/md/dm-ima.c1
-rw-r--r--drivers/md/dm-ps-historical-service-time.c1
-rw-r--r--drivers/md/dm-rq.c1
-rw-r--r--drivers/md/dm-table.c169
-rw-r--r--drivers/md/dm-verity-target.c1
-rw-r--r--drivers/md/dm.c38
-rw-r--r--drivers/md/md.c12
-rw-r--r--drivers/mmc/core/crypto.c11
-rw-r--r--drivers/mmc/core/sd.c1
-rw-r--r--drivers/mmc/host/cqhci-crypto.c33
-rw-r--r--drivers/mtd/mtdsuper.c1
-rw-r--r--drivers/nvdimm/blk.c5
-rw-r--r--drivers/nvdimm/btt.c5
-rw-r--r--drivers/nvdimm/core.c1
-rw-r--r--drivers/nvdimm/pmem.c3
-rw-r--r--drivers/nvme/host/core.c90
-rw-r--r--drivers/nvme/host/fc.c8
-rw-r--r--drivers/nvme/host/multipath.c22
-rw-r--r--drivers/nvme/host/nvme.h18
-rw-r--r--drivers/nvme/host/pci.c49
-rw-r--r--drivers/nvme/host/rdma.c17
-rw-r--r--drivers/nvme/host/tcp.c18
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c1
-rw-r--r--drivers/nvme/target/loop.c6
-rw-r--r--drivers/nvme/target/rdma.c1
-rw-r--r--drivers/s390/block/dasd_genhd.c1
-rw-r--r--drivers/s390/block/dcssblk.c7
-rw-r--r--drivers/scsi/hisi_sas/hisi_sas_v3_hw.c1
-rw-r--r--drivers/scsi/lpfc/lpfc.h1
-rw-r--r--drivers/scsi/scsi_debug.c10
-rw-r--r--drivers/scsi/scsi_lib.c3
-rw-r--r--drivers/scsi/sd.c1
-rw-r--r--drivers/scsi/sd_dif.c2
-rw-r--r--drivers/scsi/sg.c1
-rw-r--r--drivers/scsi/sr.c1
-rw-r--r--drivers/scsi/st.c1
-rw-r--r--drivers/scsi/ufs/ufshcd-crypto.c32
-rw-r--r--drivers/scsi/ufs/ufshcd-crypto.h9
-rw-r--r--drivers/scsi/ufs/ufshcd.c2
-rw-r--r--drivers/scsi/ufs/ufshcd.h6
-rw-r--r--drivers/scsi/virtio_scsi.c1
-rw-r--r--drivers/target/target_core_file.c1
-rw-r--r--drivers/target/target_core_iblock.c2
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c1
-rw-r--r--fs/btrfs/inode.c9
-rw-r--r--fs/direct-io.c14
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/f2fs/compress.c1
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/io_uring.c24
-rw-r--r--fs/iomap/direct-io.c57
-rw-r--r--fs/ntfs/file.c1
-rw-r--r--fs/ntfs3/file.c1
-rw-r--r--fs/orangefs/inode.c2
-rw-r--r--fs/orangefs/super.c1
-rw-r--r--fs/quota/quota.c1
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/zonefs/super.c2
-rw-r--r--include/linux/backing-dev.h19
-rw-r--r--include/linux/bio.h147
-rw-r--r--include/linux/blk-crypto-profile.h166
-rw-r--r--include/linux/blk-integrity.h183
-rw-r--r--include/linux/blk-mq.h581
-rw-r--r--include/linux/blk_types.h37
-rw-r--r--include/linux/blkdev.h909
-rw-r--r--include/linux/blktrace_api.h2
-rw-r--r--include/linux/bvec.h2
-rw-r--r--include/linux/device-mapper.h4
-rw-r--r--include/linux/fs.h10
-rw-r--r--include/linux/genhd.h25
-rw-r--r--include/linux/iomap.h5
-rw-r--r--include/linux/keyslot-manager.h120
-rw-r--r--include/linux/mmc/host.h4
-rw-r--r--include/linux/part_stat.h1
-rw-r--r--include/linux/percpu-refcount.h33
-rw-r--r--include/linux/sbitmap.h24
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/t10-pi.h2
-rw-r--r--include/linux/writeback.h14
-rw-r--r--include/scsi/scsi_device.h2
-rw-r--r--include/trace/events/block.h6
-rw-r--r--init/main.c1
-rw-r--r--kernel/acct.c1
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/sched/core.c7
-rw-r--r--kernel/sched/sched.h1
-rw-r--r--kernel/trace/blktrace.c7
-rw-r--r--lib/random32.c1
-rw-r--r--lib/sbitmap.c95
-rw-r--r--mm/backing-dev.c19
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/highmem.c1
-rw-r--r--mm/mempool.c1
-rw-r--r--mm/nommu.c1
-rw-r--r--mm/page_io.c10
-rw-r--r--mm/readahead.c1
-rw-r--r--mm/shmem.c1
-rw-r--r--mm/swapfile.c2
185 files changed, 4986 insertions, 4065 deletions
diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst
index 7f9b40d6b416..71d1044617a9 100644
--- a/Documentation/block/inline-encryption.rst
+++ b/Documentation/block/inline-encryption.rst
@@ -7,230 +7,269 @@ Inline Encryption
Background
==========
-Inline encryption hardware sits logically between memory and the disk, and can
-en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a
-fixed number of "keyslots" - slots into which encryption contexts (i.e. the
-encryption key, encryption algorithm, data unit size) can be programmed by the
-kernel at any time. Each request sent to the disk can be tagged with the index
-of a keyslot (and also a data unit number to act as an encryption tweak), and
-the inline encryption hardware will en/decrypt the data in the request with the
-encryption context programmed into that keyslot. This is very different from
-full disk encryption solutions like self encrypting drives/TCG OPAL/ATA
-Security standards, since with inline encryption, any block on disk could be
-encrypted with any encryption context the kernel chooses.
-
+Inline encryption hardware sits logically between memory and disk, and can
+en/decrypt data as it goes in/out of the disk. For each I/O request, software
+can control exactly how the inline encryption hardware will en/decrypt the data
+in terms of key, algorithm, data unit size (the granularity of en/decryption),
+and data unit number (a value that determines the initialization vector(s)).
+
+Some inline encryption hardware accepts all encryption parameters including raw
+keys directly in low-level I/O requests. However, most inline encryption
+hardware instead has a fixed number of "keyslots" and requires that the key,
+algorithm, and data unit size first be programmed into a keyslot. Each
+low-level I/O request then just contains a keyslot index and data unit number.
+
+Note that inline encryption hardware is very different from traditional crypto
+accelerators, which are supported through the kernel crypto API. Traditional
+crypto accelerators operate on memory regions, whereas inline encryption
+hardware operates on I/O requests. Thus, inline encryption hardware needs to be
+managed by the block layer, not the kernel crypto API.
+
+Inline encryption hardware is also very different from "self-encrypting drives",
+such as those based on the TCG Opal or ATA Security standards. Self-encrypting
+drives don't provide fine-grained control of encryption and provide no way to
+verify the correctness of the resulting ciphertext. Inline encryption hardware
+provides fine-grained control of encryption, including the choice of key and
+initialization vector for each sector, and can be tested for correctness.
Objective
=========
-We want to support inline encryption (IE) in the kernel.
-To allow for testing, we also want a crypto API fallback when actual
-IE hardware is absent. We also want IE to work with layered devices
-like dm and loopback (i.e. we want to be able to use the IE hardware
-of the underlying devices if present, or else fall back to crypto API
-en/decryption).
-
+We want to support inline encryption in the kernel. To make testing easier, we
+also want support for falling back to the kernel crypto API when actual inline
+encryption hardware is absent. We also want inline encryption to work with
+layered devices like device-mapper and loopback (i.e. we want to be able to use
+the inline encryption hardware of the underlying devices if present, or else
+fall back to crypto API en/decryption).
Constraints and notes
=====================
-- IE hardware has a limited number of "keyslots" that can be programmed
- with an encryption context (key, algorithm, data unit size, etc.) at any time.
- One can specify a keyslot in a data request made to the device, and the
- device will en/decrypt the data using the encryption context programmed into
- that specified keyslot. When possible, we want to make multiple requests with
- the same encryption context share the same keyslot.
-
-- We need a way for upper layers like filesystems to specify an encryption
- context to use for en/decrypting a struct bio, and a device driver (like UFS)
- needs to be able to use that encryption context when it processes the bio.
-
-- We need a way for device drivers to expose their inline encryption
- capabilities in a unified way to the upper layers.
-
-
-Design
-======
-
-We add a struct bio_crypt_ctx to struct bio that can
-represent an encryption context, because we need to be able to pass this
-encryption context from the upper layers (like the fs layer) to the
-device driver to act upon.
-
-While IE hardware works on the notion of keyslots, the FS layer has no
-knowledge of keyslots - it simply wants to specify an encryption context to
-use while en/decrypting a bio.
-
-We introduce a keyslot manager (KSM) that handles the translation from
-encryption contexts specified by the FS to keyslots on the IE hardware.
-This KSM also serves as the way IE hardware can expose its capabilities to
-upper layers. The generic mode of operation is: each device driver that wants
-to support IE will construct a KSM and set it up in its struct request_queue.
-Upper layers that want to use IE on this device can then use this KSM in
-the device's struct request_queue to translate an encryption context into
-a keyslot. The presence of the KSM in the request queue shall be used to mean
-that the device supports IE.
-
-The KSM uses refcounts to track which keyslots are idle (either they have no
-encryption context programmed, or there are no in-flight struct bios
-referencing that keyslot). When a new encryption context needs a keyslot, it
-tries to find a keyslot that has already been programmed with the same
-encryption context, and if there is no such keyslot, it evicts the least
-recently used idle keyslot and programs the new encryption context into that
-one. If no idle keyslots are available, then the caller will sleep until there
-is at least one.
-
-
-blk-mq changes, other block layer changes and blk-crypto-fallback
-=================================================================
-
-We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to
-struct request. These will be referred to as the ``crypto fields``
-for the request. This ``keyslot`` is the keyslot into which the
-``bi_crypt_context`` has been programmed in the KSM of the ``request_queue``
-that this request is being sent to.
-
-We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain
-blissfully unaware of whether or not real inline encryption hardware is present
-underneath. When a bio is submitted with a target ``request_queue`` that doesn't
-support the encryption context specified with the bio, the block layer will
-en/decrypt the bio with the blk-crypto-fallback.
-
-If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio
-is encrypted stored in the bounce bio - blk-mq will then proceed to process the
-bounce bio as if it were not encrypted at all (except when blk-integrity is
-concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an
-internal function that cleans up the bounce bio and ends the original bio.
-
-If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``)
-is saved and overwritten by ``blk-crypto-fallback`` to
-``bio_crypto_fallback_decrypt_bio``. The bio's ``bi_crypt_context`` is also
-overwritten with ``NULL``, so that to the rest of the stack, the bio looks
-as if it was a regular bio that never had an encryption context specified.
-``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original
-``bi_end_io`` (and also ``bi_private``) and end the bio again.
-
-Regardless of whether real inline encryption hardware is used or the
+- We need a way for upper layers (e.g. filesystems) to specify an encryption
+ context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need
+ to be able to use that encryption context when they process the request.
+ Encryption contexts also introduce constraints on bio merging; the block layer
+ needs to be aware of these constraints.
+
+- Different inline encryption hardware has different supported algorithms,
+ supported data unit sizes, maximum data unit numbers, etc. We call these
+ properties the "crypto capabilities". We need a way for device drivers to
+ advertise crypto capabilities to upper layers in a generic way.
+
+- Inline encryption hardware usually (but not always) requires that keys be
+ programmed into keyslots before being used. Since programming keyslots may be
+ slow and there may not be very many keyslots, we shouldn't just program the
+ key for every I/O request, but rather keep track of which keys are in the
+ keyslots and reuse an already-programmed keyslot when possible.
+
+- Upper layers typically define a specific end-of-life for crypto keys, e.g.
+ when an encrypted directory is locked or when a crypto mapping is torn down.
+ At these times, keys are wiped from memory. We must provide a way for upper
+ layers to also evict keys from any keyslots they are present in.
+
+- When possible, device-mapper devices must be able to pass through the inline
+ encryption support of their underlying devices. However, it doesn't make
+ sense for device-mapper devices to have keyslots themselves.
+
+Basic design
+============
+
+We introduce ``struct blk_crypto_key`` to represent an inline encryption key and
+how it will be used. This includes the actual bytes of the key; the size of the
+key; the algorithm and data unit size the key will be used with; and the number
+of bytes needed to represent the maximum data unit number the key will be used
+with.
+
+We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It
+contains a data unit number and a pointer to a blk_crypto_key. We add pointers
+to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users
+of the block layer (e.g. filesystems) to provide an encryption context when
+creating a bio and have it be passed down the stack for processing by the block
+layer and device drivers. Note that the encryption context doesn't explicitly
+say whether to encrypt or decrypt, as that is implicit from the direction of the
+bio; WRITE means encrypt, and READ means decrypt.
+
+We also introduce ``struct blk_crypto_profile`` to contain all generic inline
+encryption-related state for a particular inline encryption device. The
+blk_crypto_profile serves as the way that drivers for inline encryption hardware
+advertise their crypto capabilities and provide certain functions (e.g.,
+functions to program and evict keys) to upper layers. Each device driver that
+wants to support inline encryption will construct a blk_crypto_profile, then
+associate it with the disk's request_queue.
+
+The blk_crypto_profile also manages the hardware's keyslots, when applicable.
+This happens in the block layer, so that users of the block layer can just
+specify encryption contexts and don't need to know about keyslots at all, nor do
+device drivers need to care about most details of keyslot management.
+
+Specifically, for each keyslot, the block layer (via the blk_crypto_profile)
+keeps track of which blk_crypto_key that keyslot contains (if any), and how many
+in-flight I/O requests are using it. When the block layer creates a
+``struct request`` for a bio that has an encryption context, it grabs a keyslot
+that already contains the key if possible. Otherwise it waits for an idle
+keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the
+least-recently-used idle keyslot using the function the device driver provided.
+In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of
+the request, where it is then accessible to device drivers and is released after
+the request completes.
+
+``struct request`` also contains a pointer to the original bio_crypt_ctx.
+Requests can be built from multiple bios, and the block layer must take the
+encryption context into account when trying to merge bios and requests. For two
+bios/requests to be merged, they must have compatible encryption contexts: both
+unencrypted, or both encrypted with the same key and contiguous data unit
+numbers. Only the encryption context for the first bio in a request is
+retained, since the remaining bios have been verified to be merge-compatible
+with the first bio.
+
+To make it possible for inline encryption to work with request_queue based
+layered devices, when a request is cloned, its encryption context is cloned as
+well. When the cloned request is submitted, it is then processed as usual; this
+includes getting a keyslot from the clone's target device if needed.
+
+blk-crypto-fallback
+===================
+
+It is desirable for the inline encryption support of upper layers (e.g.
+filesystems) to be testable without real inline encryption hardware, and
+likewise for the block layer's keyslot management logic. It is also desirable
+to allow upper layers to just always use inline encryption rather than have to
+implement encryption in multiple ways.
+
+Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
+of inline encryption using the kernel crypto API. blk-crypto-fallback is built
+into the block layer, so it works on any block device without any special setup.
+Essentially, when a bio with an encryption context is submitted to a
+request_queue that doesn't support that encryption context, the block layer will
+handle en/decryption of the bio using blk-crypto-fallback.
+
+For encryption, the data cannot be encrypted in-place, as callers usually rely
+on it being unmodified. Instead, blk-crypto-fallback allocates bounce pages,
+fills a new bio with those bounce pages, encrypts the data into those bounce
+pages, and submits that "bounce" bio. When the bounce bio completes,
+blk-crypto-fallback completes the original bio. If the original bio is too
+large, multiple bounce bios may be required; see the code for details.
+
+For decryption, blk-crypto-fallback "wraps" the bio's completion callback
+(``bi_complete``) and private data (``bi_private``) with its own, unsets the
+bio's encryption context, then submits the bio. If the read completes
+successfully, blk-crypto-fallback restores the bio's original completion
+callback and private data, then decrypts the bio's data in-place using the
+kernel crypto API. Decryption happens from a workqueue, as it may sleep.
+Afterwards, blk-crypto-fallback completes the bio.
+
+In both cases, the bios that blk-crypto-fallback submits no longer have an
+encryption context. Therefore, lower layers only see standard unencrypted I/O.
+
+blk-crypto-fallback also defines its own blk_crypto_profile and has its own
+"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects. The reason
+for this is twofold. First, it allows the keyslot management logic to be tested
+without actual inline encryption hardware. Second, similar to actual inline
+encryption hardware, the crypto API doesn't accept keys directly in requests but
+rather requires that keys be set ahead of time, and setting keys can be
+expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path
+at all due to the locks it takes. Therefore, the concept of keyslots still
+makes sense for blk-crypto-fallback.
+
+Note that regardless of whether real inline encryption hardware or
blk-crypto-fallback is used, the ciphertext written to disk (and hence the
-on-disk format of data) will be the same (assuming the hardware's implementation
-of the algorithm being used adheres to spec and functions correctly).
-
-If a ``request queue``'s inline encryption hardware claimed to support the
-encryption context specified with a bio, then it will not be handled by the
-``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a
-struct request needs to be allocated for that bio. At that point,
-blk-mq tries to program the encryption context into the ``request_queue``'s
-keyslot_manager, and obtain a keyslot, which it stores in its newly added
-``keyslot`` field. This keyslot is released when the request is completed.
-
-When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called,
-which sets the request's ``crypt_ctx`` to a copy of the bio's
-``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent
-bio is merged to the front of the request, which updates the ``crypt_ctx`` of
-the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first
-bio in its bio-list (blk-mq needs to be careful to maintain this invariant
-during bio and request merges).
-
-To make it possible for inline encryption to work with request queue based
-layered devices, when a request is cloned, its ``crypto fields`` are cloned as
-well. When the cloned request is submitted, blk-mq programs the
-``bi_crypt_context`` of the request into the clone's request_queue's keyslot
-manager, and stores the returned keyslot in the clone's ``keyslot``.
+on-disk format of data) will be the same (assuming that both the inline
+encryption hardware's implementation and the kernel crypto API's implementation
+of the algorithm being used adhere to spec and function correctly).
+blk-crypto-fallback is optional and is controlled by the
+``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
API presented to users of the block layer
=========================================
-``struct blk_crypto_key`` represents a crypto key (the raw key, size of the
-key, the crypto algorithm to use, the data unit size to use, and the number of
-bytes required to represent data unit numbers that will be specified with the
-``bi_crypt_context``).
-
-``blk_crypto_init_key`` allows upper layers to initialize such a
-``blk_crypto_key``.
-
-``bio_crypt_set_ctx`` should be called on any bio that a user of
-the block layer wants en/decrypted via inline encryption (or the
-blk-crypto-fallback, if hardware support isn't available for the desired
-crypto configuration). This function takes the ``blk_crypto_key`` and the
-data unit number (DUN) to use when en/decrypting the bio.
-
-``blk_crypto_config_supported`` allows upper layers to query whether or not the
-an encryption context passed to request queue can be handled by blk-crypto
-(either by real inline encryption hardware, or by the blk-crypto-fallback).
-This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer
-wants to use an algorithm that may not supported by hardware - this function
-lets the upper layer know ahead of time that the algorithm isn't supported,
-and the upper layer can fallback to something else if appropriate.
-
-``blk_crypto_start_using_key`` - Upper layers must call this function on
-``blk_crypto_key`` and a ``request_queue`` before using the key with any bio
-headed for that ``request_queue``. This function ensures that either the
-hardware supports the key's crypto settings, or the crypto API fallback has
-transforms for the needed mode allocated and ready to go. Note that this
-function may allocate an ``skcipher``, and must not be called from the data
-path, since allocating ``skciphers`` from the data path can deadlock.
-
-``blk_crypto_evict_key`` *must* be called by upper layers before a
-``blk_crypto_key`` is freed. Further, it *must* only be called only once
-there are no more in-flight requests that use that ``blk_crypto_key``.
-``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in
-inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback).
+``blk_crypto_config_supported()`` allows users to check ahead of time whether
+inline encryption with particular crypto settings will work on a particular
+request_queue -- either via hardware or via blk-crypto-fallback. This function
+takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
+the actual bytes of the key and instead just contains the algorithm, data unit
+size, etc. This function can be useful if blk-crypto-fallback is disabled.
+
+``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
+
+Users must call ``blk_crypto_start_using_key()`` before actually starting to use
+a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
+was called earlier). This is needed to initialize blk-crypto-fallback if it
+will be needed. This must not be called from the data path, as this may have to
+allocate resources, which may deadlock in that case.
+
+Next, to attach an encryption context to a bio, users should call
+``bio_crypt_set_ctx()``. This function allocates a bio_crypt_ctx and attaches
+it to a bio, given the blk_crypto_key and the data unit number that will be used
+for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
+later, as that happens automatically when the bio is freed or reset.
+
+Finally, when done using inline encryption with a blk_crypto_key on a
+request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
+the key is evicted from all keyslots it may be programmed into and unlinked from
+any kernel data structures it may be linked into.
+
+In summary, for users of the block layer, the lifecycle of a blk_crypto_key is
+as follows:
+
+1. ``blk_crypto_config_supported()`` (optional)
+2. ``blk_crypto_init_key()``
+3. ``blk_crypto_start_using_key()``
+4. ``bio_crypt_set_ctx()`` (potentially many times)
+5. ``blk_crypto_evict_key()`` (after all I/O has completed)
+6. Zeroize the blk_crypto_key (this has no dedicated function)
+
+If a blk_crypto_key is being used on multiple request_queues, then
+``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
+and ``blk_crypto_evict_key()`` must be called on each request_queue.
API presented to device drivers
===============================
-A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in
-the ``request_queue`` of the device. The device driver needs to call
-``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the
-``blk_keyslot_manager``, while specifying the number of keyslots supported by
-the hardware.
-
-The device driver also needs to tell the KSM how to actually manipulate the
-IE hardware in the device to do things like programming the crypto key into
-the IE hardware into a particular keyslot. All this is achieved through the
-struct blk_ksm_ll_ops field in the KSM that the device driver
-must fill up after initing the ``blk_keyslot_manager``.
-
-The KSM also handles runtime power management for the device when applicable
-(e.g. when it wants to program a crypto key into the IE hardware, the device
-must be runtime powered on) - so the device driver must also set the ``dev``
-field in the ksm to point to the `struct device` for the KSM to use for runtime
-power management.
-
-``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device
-needs each and every of its keyslots to be reprogrammed with the key it
-"should have" at the point in time when the function is called. This is useful
-e.g. if a device loses all its keys on runtime power down/up.
-
-If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then
-``blk_ksm_destroy`` should be called to free up all resources used by a
-``blk_keyslot_manager`` once it is no longer needed.
+A device driver that wants to support inline encryption must set up a
+blk_crypto_profile in the request_queue of its device. To do this, it first
+must call ``blk_crypto_profile_init()`` (or its resource-managed variant
+``devm_blk_crypto_profile_init()``), providing the number of keyslots.
+
+Next, it must advertise its crypto capabilities by setting fields in the
+blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``.
+
+It then must set function pointers in the ``ll_ops`` field of the
+blk_crypto_profile to tell upper layers how to control the inline encryption
+hardware, e.g. how to program and evict keyslots. Most drivers will need to
+implement ``keyslot_program`` and ``keyslot_evict``. For details, see the
+comments for ``struct blk_crypto_ll_ops``.
+
+Once the driver registers a blk_crypto_profile with a request_queue, I/O
+requests the driver receives via that queue may have an encryption context. All
+encryption contexts will be compatible with the crypto capabilities declared in
+the blk_crypto_profile, so drivers don't need to worry about handling
+unsupported requests. Also, if a nonzero number of keyslots was declared in the
+blk_crypto_profile, then all I/O requests that have an encryption context will
+also have a keyslot which was already programmed with the appropriate key.
+
+If the driver implements runtime suspend and its blk_crypto_ll_ops don't work
+while the device is runtime-suspended, then the driver must also set the ``dev``
+field of the blk_crypto_profile to point to the ``struct device`` that will be
+resumed before any of the low-level operations are called.
+
+If there are situations where the inline encryption hardware loses the contents
+of its keyslots, e.g. device resets, the driver must handle reprogramming the
+keyslots. To do this, the driver may call ``blk_crypto_reprogram_all_keys()``.
+
+Finally, if the driver used ``blk_crypto_profile_init()`` instead of
+``devm_blk_crypto_profile_init()``, then it is responsible for calling
+``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed.
Layered Devices
===============
-Request queue based layered devices like dm-rq that wish to support IE need to
-create their own keyslot manager for their request queue, and expose whatever
-functionality they choose. When a layered device wants to pass a clone of that
-request to another ``request_queue``, blk-crypto will initialize and prepare the
-clone as necessary - see ``blk_crypto_insert_cloned_request`` in
-``blk-crypto.c``.
-
-
-Future Optimizations for layered devices
-========================================
-
-Creating a keyslot manager for a layered device uses up memory for each
-keyslot, and in general, a layered device merely passes the request on to a
-"child" device, so the keyslots in the layered device itself are completely
-unused, and don't need any refcounting or keyslot programming. We can instead
-define a new type of KSM; the "passthrough KSM", that layered devices can use
-to advertise an unlimited number of keyslots, and support for any encryption
-algorithms they choose, while not actually using any memory for each keyslot.
-Another use case for the "passthrough KSM" is for IE devices that do not have a
-limited number of keyslots.
-
+Request queue based layered devices like dm-rq that wish to support inline
+encryption need to create their own blk_crypto_profile for their request_queue,
+and expose whatever functionality they choose. When a layered device wants to
+pass a clone of that request to another request_queue, blk-crypto will
+initialize and prepare the clone as necessary; see
+``blk_crypto_insert_cloned_request()``.
Interaction between inline encryption and blk integrity
=======================================================
@@ -257,7 +296,7 @@ Because there isn't any real hardware yet, it seems prudent to assume that
hardware implementations might not implement both features together correctly,
and disallow the combination for now. Whenever a device supports integrity, the
kernel will pretend that the device does not support hardware inline encryption
-(by essentially setting the keyslot manager in the request_queue of the device
-to NULL). When the crypto API fallback is enabled, this means that all bios with
-and encryption context will use the fallback, and IO will complete as usual.
-When the fallback is disabled, a bio with an encryption context will be failed.
+(by setting the blk_crypto_profile in the request_queue of the device to NULL).
+When the crypto API fallback is enabled, this means that all bios with and
+encryption context will use the fallback, and IO will complete as usual. When
+the fallback is disabled, a bio with an encryption context will be failed.
diff --git a/Makefile b/Makefile
index ed6e7ec60eff..a5231635ed43 100644
--- a/Makefile
+++ b/Makefile
@@ -1115,7 +1115,8 @@ export MODORDER := $(extmod_prefix)modules.order
export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
+core-$(CONFIG_BLOCK) += block/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 9a8394e96388..4ef457ba5220 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -58,7 +58,7 @@ struct nfhd_device {
struct gendisk *disk;
};
-static blk_qc_t nfhd_submit_bio(struct bio *bio)
+static void nfhd_submit_bio(struct bio *bio)
{
struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
@@ -76,7 +76,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio)
sec += len;
}
bio_endio(bio);
- return BLK_QC_T_NONE;
}
static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
diff --git a/arch/mips/rb532/prom.c b/arch/mips/rb532/prom.c
index 23ad8dd9aa5e..b11693715547 100644
--- a/arch/mips/rb532/prom.c
+++ b/arch/mips/rb532/prom.c
@@ -16,7 +16,6 @@
#include <linux/console.h>
#include <linux/memblock.h>
#include <linux/ioport.h>
-#include <linux/blkdev.h>
#include <asm/bootinfo.h>
#include <asm/mach-rc32434/ddr.h>
diff --git a/arch/mips/sibyte/common/cfe.c b/arch/mips/sibyte/common/cfe.c
index a3323f8dcc1b..1a504294d85f 100644
--- a/arch/mips/sibyte/common/cfe.c
+++ b/arch/mips/sibyte/common/cfe.c
@@ -7,7 +7,6 @@
#include <linux/kernel.h>
#include <linux/linkage.h>
#include <linux/mm.h>
-#include <linux/blkdev.h>
#include <linux/memblock.h>
#include <linux/pm.h>
#include <linux/smp.h>
diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c
index 538a2791b48c..f07b15dd1c1a 100644
--- a/arch/mips/sibyte/swarm/setup.c
+++ b/arch/mips/sibyte/swarm/setup.c
@@ -11,7 +11,6 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/memblock.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/screen_info.h>
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index cfef61a7b6c2..97305bde1b16 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -25,7 +25,6 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/blkdev.h> /* for initrd_* */
#include <linux/pagemap.h>
#include <asm/pgalloc.h>
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index bed05b644c2c..cb25acccd746 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -21,6 +21,7 @@
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
+#include <linux/seq_file.h>
#include <linux/slab.h>
#include <asm/prom.h>
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index cd9dc0556e91..fefd343412c7 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -27,6 +27,7 @@
#include <linux/blk-mq.h>
#include <linux/ata.h>
#include <linux/hdreg.h>
+#include <linux/major.h>
#include <linux/cdrom.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 3cdfa00738e0..ddd1fe3db474 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -100,7 +100,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
spin_unlock(&dev->lock);
}
-static blk_qc_t simdisk_submit_bio(struct bio *bio)
+static void simdisk_submit_bio(struct bio *bio)
{
struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
@@ -118,7 +118,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio)
}
bio_endio(bio);
- return BLK_QC_T_NONE;
}
static int simdisk_open(struct block_device *bdev, fmode_t mode)
diff --git a/block/Kconfig b/block/Kconfig
index 8e28ae7718bd..c6ce41a5e5b2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -73,7 +73,7 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
- depends on BLK_CGROUP=y
+ depends on BLK_CGROUP
select BLK_CGROUP_RWSTAT
help
Block layer bio throttling support. It can be used to limit
@@ -112,7 +112,7 @@ config BLK_WBT_MQ
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
- depends on BLK_CGROUP=y
+ depends on BLK_CGROUP
help
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +132,7 @@ config BLK_CGROUP_FC_APPID
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
- depends on BLK_CGROUP=y
+ depends on BLK_CGROUP
select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
help
@@ -190,39 +190,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.
-menu "Partition Types"
-
source "block/partitions/Kconfig"
-endmenu
-
-endif # BLOCK
-
config BLOCK_COMPAT
- bool
- depends on BLOCK && COMPAT
- default y
+ def_bool COMPAT
config BLK_MQ_PCI
- bool
- depends on BLOCK && PCI
- default y
+ def_bool PCI
config BLK_MQ_VIRTIO
bool
- depends on BLOCK && VIRTIO
+ depends on VIRTIO
default y
config BLK_MQ_RDMA
bool
- depends on BLOCK && INFINIBAND
+ depends on INFINIBAND
default y
config BLK_PM
- def_bool BLOCK && PM
+ def_bool PM
# do not use in new code
config BLOCK_HOLDER_DEPRECATED
bool
source "block/Kconfig.iosched"
+
+endif # BLOCK
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 2f2158e05a91..885fee86dfca 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
menu "IO Schedulers"
config MQ_IOSCHED_DEADLINE
@@ -45,5 +43,3 @@ config BFQ_CGROUP_DEBUG
files in a cgroup which can be useful for debugging.
endmenu
-
-endif
diff --git a/block/Makefile b/block/Makefile
index 41aa1ba69c90..44df57e562bf 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,13 +3,13 @@
# Makefile for the kernel block layer
#
-obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
- disk-events.o
+ disk-events.o blk-ia-ranges.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bdev.c b/block/bdev.c
index 485a258b0ab3..7e6156203a71 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -12,6 +12,7 @@
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
@@ -326,12 +327,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_disk->queue, 0);
+ result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
- blk_queue_exit(bdev->bd_disk->queue);
+ blk_queue_exit(bdev_get_queue(bdev));
return result;
}
@@ -362,7 +363,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_disk->queue, 0);
+ result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
@@ -375,7 +376,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
- blk_queue_exit(bdev->bd_disk->queue);
+ blk_queue_exit(bdev_get_queue(bdev));
return result;
}
@@ -492,6 +493,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno;
bdev->bd_inode = inode;
+ bdev->bd_queue = disk->queue;
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
@@ -962,9 +964,11 @@ EXPORT_SYMBOL(blkdev_put);
* @pathname: special file representing the block device
* @dev: return value of the block device's dev_t
*
- * Get a reference to the blockdevice at @pathname in the current
- * namespace if possible and return it. Return ERR_PTR(error)
- * otherwise.
+ * Lookup the block device's dev_t at @pathname in the current
+ * namespace if possible and return it by @dev.
+ *
+ * RETURNS:
+ * 0 if succeeded, errno otherwise.
*/
int lookup_bdev(const char *pathname, dev_t *dev)
{
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 85b8e1c3a762..24a5c5329bcd 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -6,13 +6,13 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
-#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
+#include "elevator.h"
#include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG
@@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp))
- return -ENOMEM;
+ goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
@@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) ||
- bfq_stat_init(&stats->empty_time, gfp)) {
- bfqg_stats_exit(stats);
- return -ENOMEM;
- }
+ bfq_stat_init(&stats->empty_time, gfp))
+ goto error;
#endif
return 0;
+
+error:
+ bfqg_stats_exit(stats);
+ return -ENOMEM;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 480e1a134859..fec18118dc30 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -117,7 +117,6 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
-#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
@@ -127,6 +126,7 @@
#include <trace/events/block.h>
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
@@ -6884,8 +6884,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
+ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 6b47cddbbca1..d25114715459 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
- bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
+ bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
diff --git a/block/bio.c b/block/bio.c
index a6fb6a0b4295..15ab0d6d1c06 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size,
- ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
+ ARCH_KMALLOC_MINALIGN,
+ SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
if (!bslab->slab)
goto fail_alloc_slab;
@@ -156,7 +157,7 @@ out:
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
- BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
+ BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
@@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
+ bio->bi_cookie = BLK_QC_T_NONE;
bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table;
@@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
* REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod.
*/
-void bio_truncate(struct bio *bio, unsigned new_size)
+static void bio_truncate(struct bio *bio, unsigned new_size)
{
struct bio_vec bv;
struct bvec_iter iter;
@@ -677,7 +679,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
void bio_put(struct bio *bio)
{
if (unlikely(bio_flagged(bio, BIO_REFFED))) {
- BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+ BUG_ON(!atomic_read(&bio->__bi_cnt));
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
@@ -772,6 +774,23 @@ const char *bio_devname(struct bio *bio, char *buf)
}
EXPORT_SYMBOL(bio_devname);
+/**
+ * bio_full - check if the bio is full
+ * @bio: bio to check
+ * @len: length of one segment to be added
+ *
+ * Return true if @bio is full and one segment with @len bytes can't be
+ * added to the bio, otherwise return false
+ */
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
@@ -791,6 +810,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
}
+/**
+ * __bio_try_merge_page - try appending data to an existing bvec.
+ * @bio: destination bio
+ * @page: start page to add
+ * @len: length of the data to add
+ * @off: offset of the data relative to @page
+ * @same_page: return if the segment has been merged inside the same page
+ *
+ * Try to add the data at @page + @off to the last bvec of @bio. This is a
+ * useful optimisation for file systems with a block size smaller than the
+ * page size.
+ *
+ * Warn if (@len, @off) crosses pages in case that @same_page is true.
+ *
+ * Return %true on success or %false on failure.
+ */
+static bool __bio_try_merge_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int off, bool *same_page)
+{
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return false;
+
+ if (bio->bi_vcnt > 0) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (page_is_mergeable(bv, page, len, off, same_page)) {
+ if (bio->bi_iter.bi_size > UINT_MAX - len) {
+ *same_page = false;
+ return false;
+ }
+ bv->bv_len += len;
+ bio->bi_iter.bi_size += len;
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough
@@ -908,7 +965,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@@ -923,45 +980,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio. This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
-{
- if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
- return false;
-
- if (bio->bi_vcnt > 0) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page_is_mergeable(bv, page, len, off, same_page)) {
- if (bio->bi_iter.bi_size > UINT_MAX - len) {
- *same_page = false;
- return false;
- }
- bv->bv_len += len;
- bio->bi_iter.bi_size += len;
- return true;
- }
- }
- return false;
-}
-EXPORT_SYMBOL_GPL(__bio_try_merge_page);
-
-/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
* @page: start page to add
@@ -1015,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
-void bio_release_pages(struct bio *bio, bool mark_dirty)
+void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
- if (bio_flagged(bio, BIO_NO_PAGE_REF))
- return;
-
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
}
}
-EXPORT_SYMBOL_GPL(bio_release_pages);
+EXPORT_SYMBOL_GPL(__bio_release_pages);
-static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
+ size_t size = iov_iter_count(iter);
+
WARN_ON_ONCE(bio->bi_max_vecs);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ size_t max_sectors = queue_max_zone_append_sectors(q);
+
+ size = min(size, max_sectors << SECTOR_SHIFT);
+ }
+
bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
- bio->bi_iter.bi_size = iter->count;
+ bio->bi_iter.bi_size = size;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
-{
- __bio_iov_bvec_set(bio, iter);
- iov_iter_advance(iter, iter->count);
- return 0;
-}
-
-static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
-{
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
- struct iov_iter i = *iter;
-
- iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
- __bio_iov_bvec_set(bio, &i);
- iov_iter_advance(iter, i.count);
- return 0;
-}
-
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@@ -1130,7 +1136,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
@@ -1202,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0;
if (iov_iter_is_bvec(iter)) {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- return bio_iov_bvec_set_append(bio, iter);
- return bio_iov_bvec_set(bio, iter);
+ bio_iov_bvec_set(bio, iter);
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ return 0;
}
do {
@@ -1260,18 +1266,7 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio: bio to advance
- * @bytes: number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
+void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
@@ -1279,7 +1274,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
-EXPORT_SYMBOL(bio_advance);
+EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
@@ -1467,10 +1462,10 @@ again:
return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
- rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
+ rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9a1c5839dd46..88b1fce90520 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,6 +32,7 @@
#include <linux/psi.h>
#include "blk.h"
#include "blk-ioprio.h"
+#include "blk-throttle.h"
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -620,7 +621,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx)
- __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
+ __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{
struct block_device *bdev;
struct request_queue *q;
@@ -631,7 +632,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- q = bdev->bd_disk->queue;
+ q = bdev_get_queue(bdev);
+
+ /*
+ * blkcg_deactivate_policy() requires queue to be frozen, we can grab
+ * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
+ */
+ ret = blk_queue_enter(q, 0);
+ if (ret)
+ return ret;
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
@@ -702,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
goto success;
}
success:
+ blk_queue_exit(q);
ctx->bdev = bdev;
ctx->blkg = blkg;
ctx->body = input;
@@ -714,6 +724,7 @@ fail_unlock:
rcu_read_unlock();
fail:
blkdev_put_no_open(bdev);
+ blk_queue_exit(q);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
@@ -736,9 +747,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
* with blkg_conf_prep().
*/
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
- __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
+ __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{
- spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
rcu_read_unlock();
blkdev_put_no_open(ctx->bdev);
}
@@ -841,7 +852,7 @@ static void blkcg_fill_root_iostats(void)
while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg =
- blk_queue_root_blkg(bdev->bd_disk->queue);
+ blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
@@ -1800,7 +1811,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css),
- bio->bi_bdev->bd_disk->queue);
+ bdev_get_queue(bio->bi_bdev));
while (blkg) {
if (blkg_tryget(blkg)) {
ret_blkg = blkg;
@@ -1836,8 +1847,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css);
} else {
- blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
- bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
+ blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
+ bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4d8f5fe91588..fd389a16013c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -18,6 +18,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-pm.h>
+#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -49,6 +50,7 @@
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
+#include "blk-throttle.h"
struct dentry *blk_debugfs_root;
@@ -214,8 +216,7 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-static void print_req_error(struct request *req, blk_status_t status,
- const char *caller)
+void blk_print_req_error(struct request *req, blk_status_t status)
{
int idx = (__force int)status;
@@ -223,9 +224,9 @@ static void print_req_error(struct request *req, blk_status_t status,
return;
printk_ratelimited(KERN_ERR
- "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
"phys_seg %u prio class %u\n",
- caller, blk_errors[idx].name,
+ blk_errors[idx].name,
req->rq_disk ? req->rq_disk->disk_name : "?",
blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
req->cmd_flags & ~REQ_OP_MASK,
@@ -233,33 +234,6 @@ static void print_req_error(struct request *req, blk_status_t status,
IOPRIO_PRIO_CLASS(req->ioprio));
}
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (error)
- bio->bi_status = error;
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
-
- bio_advance(bio, nbytes);
-
- if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
void blk_dump_rq_flags(struct request *rq, char *msg)
{
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
@@ -402,7 +376,7 @@ void blk_cleanup_queue(struct request_queue *q)
*/
mutex_lock(&q->sysfs_lock);
if (q->elevator)
- blk_mq_sched_free_requests(q);
+ blk_mq_sched_free_rqs(q);
mutex_unlock(&q->sysfs_lock);
percpu_ref_exit(&q->q_usage_counter);
@@ -415,7 +389,7 @@ EXPORT_SYMBOL(blk_cleanup_queue);
static bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
rcu_read_lock();
- if (!percpu_ref_tryget_live(&q->q_usage_counter))
+ if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
goto fail;
/*
@@ -430,7 +404,7 @@ static bool blk_try_enter_queue(struct request_queue *q, bool pm)
return true;
fail_put:
- percpu_ref_put(&q->q_usage_counter);
+ blk_queue_exit(q);
fail:
rcu_read_unlock();
return false;
@@ -470,10 +444,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
static inline int bio_queue_enter(struct bio *bio)
{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- struct request_queue *q = disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
while (!blk_try_enter_queue(q, false)) {
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
if (bio->bi_opf & REQ_NOWAIT) {
if (test_bit(GD_DEAD, &disk->state))
goto dead;
@@ -553,7 +528,7 @@ struct request_queue *blk_alloc_queue(int node_id)
q->node = node_id;
- atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
+ atomic_set(&q->nr_active_requests_shared_tags, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
@@ -586,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id)
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
- q->nr_requests = BLKDEV_MAX_RQ;
+ q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
@@ -654,8 +629,9 @@ static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
- pr_info_ratelimited("attempt to access beyond end of device\n"
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
"%s: rw=%d, want=%llu, limit=%llu\n",
+ current->comm,
bio_devname(bio, b), bio->bi_opf,
bio_end_sector(bio), maxsector);
}
@@ -797,7 +773,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
- struct request_queue *q = bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bdev);
blk_status_t status = BLK_STS_IOERR;
struct blk_plug *plug;
@@ -839,7 +815,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
}
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- bio_clear_hipri(bio);
+ bio_clear_polled(bio);
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@@ -912,25 +888,22 @@ end_io:
return false;
}
-static blk_qc_t __submit_bio(struct bio *bio)
+static void __submit_bio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
- blk_qc_t ret = BLK_QC_T_NONE;
if (unlikely(bio_queue_enter(bio) != 0))
- return BLK_QC_T_NONE;
+ return;
if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio))
goto queue_exit;
- if (disk->fops->submit_bio) {
- ret = disk->fops->submit_bio(bio);
- goto queue_exit;
+ if (!disk->fops->submit_bio) {
+ blk_mq_submit_bio(bio);
+ return;
}
- return blk_mq_submit_bio(bio);
-
+ disk->fops->submit_bio(bio);
queue_exit:
blk_queue_exit(disk->queue);
- return ret;
}
/*
@@ -952,10 +925,9 @@ queue_exit:
* bio_list_on_stack[1] contains bios that were submitted before the current
* ->submit_bio_bio, but that haven't been processed yet.
*/
-static blk_qc_t __submit_bio_noacct(struct bio *bio)
+static void __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
- blk_qc_t ret = BLK_QC_T_NONE;
BUG_ON(bio->bi_next);
@@ -963,7 +935,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct bio_list lower, same;
/*
@@ -972,7 +944,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = __submit_bio(bio);
+ __submit_bio(bio);
/*
* Sort new bios into those for a lower level and those for the
@@ -981,7 +953,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
bio_list_init(&lower);
bio_list_init(&same);
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_bdev->bd_disk->queue)
+ if (q == bdev_get_queue(bio->bi_bdev))
bio_list_add(&same, bio);
else
bio_list_add(&lower, bio);
@@ -995,22 +967,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
current->bio_list = NULL;
- return ret;
}
-static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
+static void __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
- blk_qc_t ret;
current->bio_list = bio_list;
do {
- ret = __submit_bio(bio);
+ __submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
current->bio_list = NULL;
- return ret;
}
/**
@@ -1022,7 +991,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
* systems and other upper level users of the block layer should use
* submit_bio() instead.
*/
-blk_qc_t submit_bio_noacct(struct bio *bio)
+void submit_bio_noacct(struct bio *bio)
{
/*
* We only want one ->submit_bio to be active at a time, else stack
@@ -1030,14 +999,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
* to collect a list of requests submited by a ->submit_bio method while
* it is active, and then process them after it returned.
*/
- if (current->bio_list) {
+ if (current->bio_list)
bio_list_add(&current->bio_list[0], bio);
- return BLK_QC_T_NONE;
- }
-
- if (!bio->bi_bdev->bd_disk->fops->submit_bio)
- return __submit_bio_noacct_mq(bio);
- return __submit_bio_noacct(bio);
+ else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+ __submit_bio_noacct_mq(bio);
+ else
+ __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
@@ -1054,10 +1021,10 @@ EXPORT_SYMBOL(submit_bio_noacct);
* in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
* been called.
*/
-blk_qc_t submit_bio(struct bio *bio)
+void submit_bio(struct bio *bio)
{
if (blkcg_punt_bio_submit(bio))
- return BLK_QC_T_NONE;
+ return;
/*
* If it's a regular read/write or a barrier with data attached,
@@ -1068,7 +1035,7 @@ blk_qc_t submit_bio(struct bio *bio)
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
count = queue_logical_block_size(
- bio->bi_bdev->bd_disk->queue) >> 9;
+ bdev_get_queue(bio->bi_bdev)) >> 9;
else
count = bio_sectors(bio);
@@ -1089,20 +1056,93 @@ blk_qc_t submit_bio(struct bio *bio)
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
- blk_qc_t ret;
psi_memstall_enter(&pflags);
- ret = submit_bio_noacct(bio);
+ submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
-
- return ret;
+ return;
}
- return submit_bio_noacct(bio);
+ submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
/**
+ * bio_poll - poll for BIO completions
+ * @bio: bio to poll for
+ * @flags: BLK_POLL_* flags that control the behavior
+ *
+ * Poll for completions on queue associated with the bio. Returns number of
+ * completed entries found.
+ *
+ * Note: the caller must either be the context that submitted @bio, or
+ * be in a RCU critical section to prevent freeing of @bio.
+ */
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
+ int ret;
+
+ if (cookie == BLK_QC_T_NONE ||
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ return 0;
+
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
+
+ if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
+ return 0;
+ if (WARN_ON_ONCE(!queue_is_mq(q)))
+ ret = 0; /* not yet implemented, should not happen */
+ else
+ ret = blk_mq_poll(q, cookie, iob, flags);
+ blk_queue_exit(q);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bio_poll);
+
+/*
+ * Helper to implement file_operations.iopoll. Requires the bio to be stored
+ * in iocb->private, and cleared before freeing the bio.
+ */
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+ unsigned int flags)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ /*
+ * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
+ * point to a freshly allocated bio at this point. If that happens
+ * we have a few cases to consider:
+ *
+ * 1) the bio is beeing initialized and bi_bdev is NULL. We can just
+ * simply nothing in this case
+ * 2) the bio points to a not poll enabled device. bio_poll will catch
+ * this and return 0
+ * 3) the bio points to a poll capable device, including but not
+ * limited to the one that the original bio pointed to. In this
+ * case we will call into the actual poll method and poll for I/O,
+ * even if we don't need to, but it won't cause harm either.
+ *
+ * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
+ * is still allocated. Because partitions hold a reference to the whole
+ * device bdev and thus disk, the disk is also still valid. Grabbing
+ * a reference to the queue in bio_poll() ensures the hctxs and requests
+ * are still valid as well.
+ */
+ rcu_read_lock();
+ bio = READ_ONCE(kiocb->private);
+ if (bio && bio->bi_bdev)
+ ret = bio_poll(bio, iob, flags);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
+
+/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
* for the new queue limits
* @q: the queue
@@ -1177,8 +1217,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
if (blk_crypto_insert_cloned_request(rq))
return BLK_STS_IOERR;
- if (blk_queue_io_stat(q))
- blk_account_io_start(rq);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
@@ -1246,41 +1285,19 @@ again:
}
}
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void __blk_account_io_done(struct request *req, u64 now)
{
- if (req->part && blk_do_io_stat(req)) {
- const int sgrp = op_stat_group(req_op(req));
-
- part_stat_lock();
- part_stat_add(req->part, sectors[sgrp], bytes >> 9);
- part_stat_unlock();
- }
-}
+ const int sgrp = op_stat_group(req_op(req));
-void blk_account_io_done(struct request *req, u64 now)
-{
- /*
- * Account IO completion. flush_rq isn't accounted as a
- * normal IO on queueing nor completion. Accounting the
- * containing request is enough.
- */
- if (req->part && blk_do_io_stat(req) &&
- !(req->rq_flags & RQF_FLUSH_SEQ)) {
- const int sgrp = op_stat_group(req_op(req));
-
- part_stat_lock();
- update_io_ticks(req->part, jiffies, true);
- part_stat_inc(req->part, ios[sgrp]);
- part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_unlock();
- }
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, true);
+ part_stat_inc(req->part, ios[sgrp]);
+ part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_unlock();
}
-void blk_account_io_start(struct request *rq)
+void __blk_account_io_start(struct request *rq)
{
- if (!blk_do_io_stat(rq))
- return;
-
/* passthrough requests can hold bios that do not have ->bi_bdev set */
if (rq->bio && rq->bio->bi_bdev)
rq->part = rq->bio->bi_bdev;
@@ -1376,112 +1393,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_steal_bios);
-/**
- * blk_update_request - Complete multiple bytes without completing the request
- * @req: the request being processed
- * @error: block status code
- * @nr_bytes: number of bytes to complete for @req
- *
- * Description:
- * Ends I/O on a number of bytes attached to @req, but doesn't complete
- * the request structure even if @req doesn't have leftover.
- * If @req has leftover, sets it up for the next range of segments.
- *
- * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
- * %false return from this function.
- *
- * Note:
- * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
- * except in the consistency check at the end of this function.
- *
- * Return:
- * %false - this request doesn't have any more data
- * %true - this request has more data
- **/
-bool blk_update_request(struct request *req, blk_status_t error,
- unsigned int nr_bytes)
-{
- int total_bytes;
-
- trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
-
- if (!req->bio)
- return false;
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
- if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
- error == BLK_STS_OK)
- req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
-
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)))
- print_req_error(req, error, __func__);
-
- blk_account_io_completion(req, nr_bytes);
-
- total_bytes = 0;
- while (req->bio) {
- struct bio *bio = req->bio;
- unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
-
- if (bio_bytes == bio->bi_iter.bi_size)
- req->bio = bio->bi_next;
-
- /* Completion has already been traced */
- bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
-
- total_bytes += bio_bytes;
- nr_bytes -= bio_bytes;
-
- if (!nr_bytes)
- break;
- }
-
- /*
- * completely done
- */
- if (!req->bio) {
- /*
- * Reset counters so that the request stacking driver
- * can find how many bytes remain in the request
- * later.
- */
- req->__data_len = 0;
- return false;
- }
-
- req->__data_len -= total_bytes;
-
- /* update sector only for requests with clear definition of sector */
- if (!blk_rq_is_passthrough(req))
- req->__sector += total_bytes >> 9;
-
- /* mixed attributes always follow the first bio */
- if (req->rq_flags & RQF_MIXED_MERGE) {
- req->cmd_flags &= ~REQ_FAILFAST_MASK;
- req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
- }
-
- if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
- /*
- * If total number of sectors is less than the first segment
- * size, something has gone terribly wrong.
- */
- if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
- blk_dump_rq_flags(req, "request botched");
- req->__data_len = blk_rq_cur_bytes(req);
- }
-
- /* recalculate the number of segments */
- req->nr_phys_segments = blk_recalc_rq_segments(req);
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_update_request);
-
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
@@ -1629,6 +1540,32 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
+{
+ struct task_struct *tsk = current;
+
+ /*
+ * If this is a nested plug, don't actually assign it.
+ */
+ if (tsk->plug)
+ return;
+
+ plug->mq_list = NULL;
+ plug->cached_rq = NULL;
+ plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
+ plug->rq_count = 0;
+ plug->multiple_queues = false;
+ plug->has_elevator = false;
+ plug->nowait = false;
+ INIT_LIST_HEAD(&plug->cb_list);
+
+ /*
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
+ */
+ tsk->plug = plug;
+}
+
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
@@ -1654,25 +1591,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
*/
void blk_start_plug(struct blk_plug *plug)
{
- struct task_struct *tsk = current;
-
- /*
- * If this is a nested plug, don't actually assign it.
- */
- if (tsk->plug)
- return;
-
- INIT_LIST_HEAD(&plug->mq_list);
- INIT_LIST_HEAD(&plug->cb_list);
- plug->rq_count = 0;
- plug->multiple_queues = false;
- plug->nowait = false;
-
- /*
- * Store ordering should not be needed here, since a potential
- * preempt will imply a full memory barrier
- */
- tsk->plug = plug;
+ blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);
@@ -1718,12 +1637,14 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
}
EXPORT_SYMBOL(blk_check_plugged);
-void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
- flush_plug_callbacks(plug, from_schedule);
-
- if (!list_empty(&plug->mq_list))
+ if (!list_empty(&plug->cb_list))
+ flush_plug_callbacks(plug, from_schedule);
+ if (!rq_list_empty(plug->mq_list))
blk_mq_flush_plug_list(plug, from_schedule);
+ if (unlikely(!from_schedule && plug->cached_rq))
+ blk_mq_free_plug_rqs(plug);
}
/**
@@ -1738,11 +1659,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
void blk_finish_plug(struct blk_plug *plug)
{
- if (plug != current->plug)
- return;
- blk_flush_plug_list(plug, false);
-
- current->plug = NULL;
+ if (plug == current->plug) {
+ blk_flush_plug(plug, false);
+ current->plug = NULL;
+ }
}
EXPORT_SYMBOL(blk_finish_plug);
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c322176a1e09..c87aba8584c6 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -12,12 +12,13 @@
#include <crypto/skcipher.h>
#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
#include <linux/crypto.h>
-#include <linux/keyslot-manager.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/scatterlist.h>
#include "blk-crypto-internal.h"
@@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
static DEFINE_MUTEX(tfms_init_lock);
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
-static struct blk_crypto_keyslot {
+static struct blk_crypto_fallback_keyslot {
enum blk_crypto_mode_num crypto_mode;
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
} *blk_crypto_keyslots;
-static struct blk_keyslot_manager blk_crypto_ksm;
+static struct blk_crypto_profile blk_crypto_fallback_profile;
static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool;
static struct bio_set crypto_bio_split;
@@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
*/
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
-static void blk_crypto_evict_keyslot(unsigned int slot)
+static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
- struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
int err;
@@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
}
-static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot)
+static int
+blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
{
- struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
const enum blk_crypto_mode_num crypto_mode =
key->crypto_cfg.crypto_mode;
int err;
if (crypto_mode != slotp->crypto_mode &&
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
key->size);
if (err) {
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
return err;
}
return 0;
}
-static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot)
+static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
{
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
return 0;
}
-/*
- * The crypto API fallback KSM ops - only used for a bio when it specifies a
- * blk_crypto_key that was not supported by the device's inline encryption
- * hardware.
- */
-static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
- .keyslot_program = blk_crypto_keyslot_program,
- .keyslot_evict = blk_crypto_keyslot_evict,
+static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
+ .keyslot_program = blk_crypto_fallback_keyslot_program,
+ .keyslot_evict = blk_crypto_fallback_keyslot_evict,
};
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
bio_endio(src_bio);
}
-static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
@@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
return bio;
}
-static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
- struct skcipher_request **ciph_req_ret,
- struct crypto_wait *wait)
+static bool
+blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
+ struct skcipher_request **ciph_req_ret,
+ struct crypto_wait *wait)
{
struct skcipher_request *ciph_req;
- const struct blk_crypto_keyslot *slotp;
- int keyslot_idx = blk_ksm_get_slot_idx(slot);
+ const struct blk_crypto_fallback_keyslot *slotp;
+ int keyslot_idx = blk_crypto_keyslot_index(slot);
slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true;
}
-static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
unsigned int i = 0;
@@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{
struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc;
- struct blk_ksm_keyslot *slot;
+ struct blk_crypto_keyslot *slot;
int data_unit_size;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
@@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st;
/* Split the bio if it's too big for single page bvec */
- if (!blk_crypto_split_bio_if_needed(bio_ptr))
+ if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false;
src_bio = *bio_ptr;
@@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
/* Allocate bounce bio for encryption */
- enc_bio = blk_crypto_clone_bio(src_bio);
+ enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE;
return false;
}
/*
- * Use the crypto API fallback keyslot manager to get a crypto_skcipher
- * for the algorithm and key specified for this bio.
+ * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+ * this bio's algorithm and key.
*/
- blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+ bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st;
goto out_put_enc_bio;
}
/* and then allocate an skcipher_request for it */
- if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot;
}
@@ -362,7 +361,7 @@ out_free_bounce_pages:
out_free_ciph_req:
skcipher_request_free(ciph_req);
out_release_keyslot:
- blk_ksm_put_slot(slot);
+ blk_crypto_put_keyslot(slot);
out_put_enc_bio:
if (enc_bio)
bio_put(enc_bio);
@@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
- struct blk_ksm_keyslot *slot;
+ struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st;
/*
- * Use the crypto API fallback keyslot manager to get a crypto_skcipher
- * for the algorithm and key specified for this bio.
+ * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+ * this bio's algorithm and key.
*/
- blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
+ bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st;
goto out_no_keyslot;
}
/* and then allocate an skcipher_request for it */
- if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE;
goto out;
}
@@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
out:
skcipher_request_free(ciph_req);
- blk_ksm_put_slot(slot);
+ blk_crypto_put_keyslot(slot);
out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio);
@@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare
*
* If bio is doing a WRITE operation, this splits the bio into two parts if it's
- * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
- * for the first part, encrypts it, and update bio_ptr to point to the bounce
- * bio.
+ * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
+ * the bounce bio.
*
* For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io.
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false;
}
- if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
- &bc->bc_key->crypto_cfg)) {
+ if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
+ &bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP;
return false;
}
@@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{
- return blk_ksm_evict_key(&blk_crypto_ksm, key);
+ return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
}
static bool blk_crypto_fallback_inited;
@@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
{
int i;
int err;
+ struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
if (blk_crypto_fallback_inited)
return 0;
@@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
if (err)
goto out;
- err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
if (err)
goto fail_free_bioset;
err = -ENOMEM;
- blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
- blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+ profile->ll_ops = blk_crypto_fallback_ll_ops;
+ profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
- blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
- blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+ profile->modes_supported[i] = 0xFFFFFFFF;
+ profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq)
- goto fail_free_ksm;
+ goto fail_destroy_profile;
blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]),
@@ -595,8 +596,8 @@ fail_free_keyslots:
kfree(blk_crypto_keyslots);
fail_free_wq:
destroy_workqueue(blk_crypto_wq);
-fail_free_ksm:
- blk_ksm_destroy(&blk_crypto_ksm);
+fail_destroy_profile:
+ blk_crypto_profile_destroy(profile);
fail_free_bioset:
bioset_exit(&crypto_bio_split);
out:
@@ -610,7 +611,7 @@ out:
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
- struct blk_crypto_keyslot *slotp;
+ struct blk_crypto_fallback_keyslot *slotp;
unsigned int i;
int err = 0;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 0d36aae538d7..2fb0d65a464c 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -7,7 +7,7 @@
#define __LINUX_BLK_CRYPTO_INTERNAL_H
#include <linux/bio.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
new file mode 100644
index 000000000000..605ba0626a5c
--- /dev/null
+++ b/block/blk-crypto-profile.c
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: blk-crypto profiles
+ *
+ * 'struct blk_crypto_profile' contains all generic inline encryption-related
+ * state for a particular inline encryption device. blk_crypto_profile serves
+ * as the way that drivers for inline encryption hardware expose their crypto
+ * capabilities and certain functions (e.g., functions to program and evict
+ * keys) to upper layers. Device drivers that want to support inline encryption
+ * construct a crypto profile, then associate it with the disk's request_queue.
+ *
+ * If the device has keyslots, then its blk_crypto_profile also handles managing
+ * these keyslots in a device-independent way, using the driver-provided
+ * functions to program and evict keys as needed. This includes keeping track
+ * of which key and how many I/O requests are using each keyslot, getting
+ * keyslots for I/O requests, and handling key eviction requests.
+ *
+ * For more information, see Documentation/block/inline-encryption.rst.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/blk-crypto-profile.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
+
+struct blk_crypto_keyslot {
+ atomic_t slot_refs;
+ struct list_head idle_slot_node;
+ struct hlist_node hash_node;
+ const struct blk_crypto_key *key;
+ struct blk_crypto_profile *profile;
+};
+
+static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+{
+ /*
+ * Calling into the driver requires profile->lock held and the device
+ * resumed. But we must resume the device first, since that can acquire
+ * and release profile->lock via blk_crypto_reprogram_all_keys().
+ */
+ if (profile->dev)
+ pm_runtime_get_sync(profile->dev);
+ down_write(&profile->lock);
+}
+
+static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+{
+ up_write(&profile->lock);
+ if (profile->dev)
+ pm_runtime_put_sync(profile->dev);
+}
+
+/**
+ * blk_crypto_profile_init() - Initialize a blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Storage drivers must call this when starting to set up a blk_crypto_profile,
+ * before filling in additional fields.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+ unsigned int num_slots)
+{
+ unsigned int slot;
+ unsigned int i;
+ unsigned int slot_hashtable_size;
+
+ memset(profile, 0, sizeof(*profile));
+ init_rwsem(&profile->lock);
+
+ if (num_slots == 0)
+ return 0;
+
+ /* Initialize keyslot management data. */
+
+ profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]),
+ GFP_KERNEL);
+ if (!profile->slots)
+ return -ENOMEM;
+
+ profile->num_slots = num_slots;
+
+ init_waitqueue_head(&profile->idle_slots_wait_queue);
+ INIT_LIST_HEAD(&profile->idle_slots);
+
+ for (slot = 0; slot < num_slots; slot++) {
+ profile->slots[slot].profile = profile;
+ list_add_tail(&profile->slots[slot].idle_slot_node,
+ &profile->idle_slots);
+ }
+
+ spin_lock_init(&profile->idle_slots_lock);
+
+ slot_hashtable_size = roundup_pow_of_two(num_slots);
+ /*
+ * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
+ * buckets. This only makes a difference when there is only 1 keyslot.
+ */
+ if (slot_hashtable_size < 2)
+ slot_hashtable_size = 2;
+
+ profile->log_slot_ht_size = ilog2(slot_hashtable_size);
+ profile->slot_hashtable =
+ kvmalloc_array(slot_hashtable_size,
+ sizeof(profile->slot_hashtable[0]), GFP_KERNEL);
+ if (!profile->slot_hashtable)
+ goto err_destroy;
+ for (i = 0; i < slot_hashtable_size; i++)
+ INIT_HLIST_HEAD(&profile->slot_hashtable[i]);
+
+ return 0;
+
+err_destroy:
+ blk_crypto_profile_destroy(profile);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_init);
+
+static void blk_crypto_profile_destroy_callback(void *profile)
+{
+ blk_crypto_profile_destroy(profile);
+}
+
+/**
+ * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init()
+ * @dev: the device which owns the blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be
+ * called automatically on driver detach.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int devm_blk_crypto_profile_init(struct device *dev,
+ struct blk_crypto_profile *profile,
+ unsigned int num_slots)
+{
+ int err = blk_crypto_profile_init(profile, num_slots);
+
+ if (err)
+ return err;
+
+ return devm_add_action_or_reset(dev,
+ blk_crypto_profile_destroy_callback,
+ profile);
+}
+EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init);
+
+static inline struct hlist_head *
+blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ return &profile->slot_hashtable[
+ hash_ptr(key, profile->log_slot_ht_size)];
+}
+
+static void
+blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot)
+{
+ struct blk_crypto_profile *profile = slot->profile;
+ unsigned long flags;
+
+ spin_lock_irqsave(&profile->idle_slots_lock, flags);
+ list_del(&slot->idle_slot_node);
+ spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ const struct hlist_head *head =
+ blk_crypto_hash_bucket_for_key(profile, key);
+ struct blk_crypto_keyslot *slotp;
+
+ hlist_for_each_entry(slotp, head, hash_node) {
+ if (slotp->key == key)
+ return slotp;
+ }
+ return NULL;
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ struct blk_crypto_keyslot *slot;
+
+ slot = blk_crypto_find_keyslot(profile, key);
+ if (!slot)
+ return NULL;
+ if (atomic_inc_return(&slot->slot_refs) == 1) {
+ /* Took first reference to this slot; remove it from LRU list */
+ blk_crypto_remove_slot_from_lru_list(slot);
+ }
+ return slot;
+}
+
+/**
+ * blk_crypto_keyslot_index() - Get the index of a keyslot
+ * @slot: a keyslot that blk_crypto_get_keyslot() returned
+ *
+ * Return: the 0-based index of the keyslot within the device's keyslots.
+ */
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot)
+{
+ return slot - slot->profile->slots;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index);
+
+/**
+ * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed.
+ * @profile: the crypto profile of the device the key will be used on
+ * @key: the key that will be used
+ * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct
+ * will be stored here; otherwise NULL will be stored here.
+ *
+ * If the device has keyslots, this gets a keyslot that's been programmed with
+ * the specified key. If the key is already in a slot, this reuses it;
+ * otherwise this waits for a slot to become idle and programs the key into it.
+ *
+ * This must be paired with a call to blk_crypto_put_keyslot().
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or
+ * one wasn't needed; or a blk_status_t error on failure.
+ */
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr)
+{
+ struct blk_crypto_keyslot *slot;
+ int slot_idx;
+ int err;
+
+ *slot_ptr = NULL;
+
+ /*
+ * If the device has no concept of "keyslots", then there is no need to
+ * get one.
+ */
+ if (profile->num_slots == 0)
+ return BLK_STS_OK;
+
+ down_read(&profile->lock);
+ slot = blk_crypto_find_and_grab_keyslot(profile, key);
+ up_read(&profile->lock);
+ if (slot)
+ goto success;
+
+ for (;;) {
+ blk_crypto_hw_enter(profile);
+ slot = blk_crypto_find_and_grab_keyslot(profile, key);
+ if (slot) {
+ blk_crypto_hw_exit(profile);
+ goto success;
+ }
+
+ /*
+ * If we're here, that means there wasn't a slot that was
+ * already programmed with the key. So try to program it.
+ */
+ if (!list_empty(&profile->idle_slots))
+ break;
+
+ blk_crypto_hw_exit(profile);
+ wait_event(profile->idle_slots_wait_queue,
+ !list_empty(&profile->idle_slots));
+ }
+
+ slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot,
+ idle_slot_node);
+ slot_idx = blk_crypto_keyslot_index(slot);
+
+ err = profile->ll_ops.keyslot_program(profile, key, slot_idx);
+ if (err) {
+ wake_up(&profile->idle_slots_wait_queue);
+ blk_crypto_hw_exit(profile);
+ return errno_to_blk_status(err);
+ }
+
+ /* Move this slot to the hash list for the new key. */
+ if (slot->key)
+ hlist_del(&slot->hash_node);
+ slot->key = key;
+ hlist_add_head(&slot->hash_node,
+ blk_crypto_hash_bucket_for_key(profile, key));
+
+ atomic_set(&slot->slot_refs, 1);
+
+ blk_crypto_remove_slot_from_lru_list(slot);
+
+ blk_crypto_hw_exit(profile);
+success:
+ *slot_ptr = slot;
+ return BLK_STS_OK;
+}
+
+/**
+ * blk_crypto_put_keyslot() - Release a reference to a keyslot
+ * @slot: The keyslot to release the reference of (may be NULL).
+ *
+ * Context: Any context.
+ */
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot)
+{
+ struct blk_crypto_profile *profile;
+ unsigned long flags;
+
+ if (!slot)
+ return;
+
+ profile = slot->profile;
+
+ if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
+ &profile->idle_slots_lock, flags)) {
+ list_add_tail(&slot->idle_slot_node, &profile->idle_slots);
+ spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+ wake_up(&profile->idle_slots_wait_queue);
+ }
+}
+
+/**
+ * __blk_crypto_cfg_supported() - Check whether the given crypto profile
+ * supports the given crypto configuration.
+ * @profile: the crypto profile to check
+ * @cfg: the crypto configuration to check for
+ *
+ * Return: %true if @profile supports the given @cfg.
+ */
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg)
+{
+ if (!profile)
+ return false;
+ if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
+ return false;
+ if (profile->max_dun_bytes_supported < cfg->dun_bytes)
+ return false;
+ return true;
+}
+
+/**
+ * __blk_crypto_evict_key() - Evict a key from a device.
+ * @profile: the crypto profile of the device
+ * @key: the key to evict. It must not still be used in any I/O.
+ *
+ * If the device has keyslots, this finds the keyslot (if any) that contains the
+ * specified key and calls the driver's keyslot_evict function to evict it.
+ *
+ * Otherwise, this just calls the driver's keyslot_evict function if it is
+ * implemented, passing just the key (without any particular keyslot). This
+ * allows layered devices to evict the key from their underlying devices.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
+ * if the keyslot is still in use, or another -errno value on other
+ * error.
+ */
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ struct blk_crypto_keyslot *slot;
+ int err = 0;
+
+ if (profile->num_slots == 0) {
+ if (profile->ll_ops.keyslot_evict) {
+ blk_crypto_hw_enter(profile);
+ err = profile->ll_ops.keyslot_evict(profile, key, -1);
+ blk_crypto_hw_exit(profile);
+ return err;
+ }
+ return 0;
+ }
+
+ blk_crypto_hw_enter(profile);
+ slot = blk_crypto_find_keyslot(profile, key);
+ if (!slot)
+ goto out_unlock;
+
+ if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
+ err = -EBUSY;
+ goto out_unlock;
+ }
+ err = profile->ll_ops.keyslot_evict(profile, key,
+ blk_crypto_keyslot_index(slot));
+ if (err)
+ goto out_unlock;
+
+ hlist_del(&slot->hash_node);
+ slot->key = NULL;
+ err = 0;
+out_unlock:
+ blk_crypto_hw_exit(profile);
+ return err;
+}
+
+/**
+ * blk_crypto_reprogram_all_keys() - Re-program all keyslots.
+ * @profile: The crypto profile
+ *
+ * Re-program all keyslots that are supposed to have a key programmed. This is
+ * intended only for use by drivers for hardware that loses its keys on reset.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ */
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile)
+{
+ unsigned int slot;
+
+ if (profile->num_slots == 0)
+ return;
+
+ /* This is for device initialization, so don't resume the device */
+ down_write(&profile->lock);
+ for (slot = 0; slot < profile->num_slots; slot++) {
+ const struct blk_crypto_key *key = profile->slots[slot].key;
+ int err;
+
+ if (!key)
+ continue;
+
+ err = profile->ll_ops.keyslot_program(profile, key, slot);
+ WARN_ON(err);
+ }
+ up_write(&profile->lock);
+}
+EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys);
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile)
+{
+ if (!profile)
+ return;
+ kvfree(profile->slot_hashtable);
+ kvfree_sensitive(profile->slots,
+ sizeof(profile->slots[0]) * profile->num_slots);
+ memzero_explicit(profile, sizeof(*profile));
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy);
+
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+ struct request_queue *q)
+{
+ if (blk_integrity_queue_supports_integrity(q)) {
+ pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+ return false;
+ }
+ q->crypto_profile = profile;
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_register);
+
+void blk_crypto_unregister(struct request_queue *q)
+{
+ q->crypto_profile = NULL;
+}
+
+/**
+ * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
+ * by child device
+ * @parent: the crypto profile for the parent device
+ * @child: the crypto profile for the child device, or NULL
+ *
+ * This clears all crypto capabilities in @parent that aren't set in @child. If
+ * @child is NULL, then this clears all parent capabilities.
+ *
+ * Only use this when setting up the crypto profile for a layered device, before
+ * it's been exposed yet.
+ */
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+ const struct blk_crypto_profile *child)
+{
+ if (child) {
+ unsigned int i;
+
+ parent->max_dun_bytes_supported =
+ min(parent->max_dun_bytes_supported,
+ child->max_dun_bytes_supported);
+ for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
+ parent->modes_supported[i] &= child->modes_supported[i];
+ } else {
+ parent->max_dun_bytes_supported = 0;
+ memset(parent->modes_supported, 0,
+ sizeof(parent->modes_supported));
+ }
+}
+EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
+
+/**
+ * blk_crypto_has_capabilities() - Check whether @target supports at least all
+ * the crypto capabilities that @reference does.
+ * @target: the target profile
+ * @reference: the reference profile
+ *
+ * Return: %true if @target supports all the crypto capabilities of @reference.
+ */
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+ const struct blk_crypto_profile *reference)
+{
+ int i;
+
+ if (!reference)
+ return true;
+
+ if (!target)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) {
+ if (reference->modes_supported[i] & ~target->modes_supported[i])
+ return false;
+ }
+
+ if (reference->max_dun_bytes_supported >
+ target->max_dun_bytes_supported)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
+
+/**
+ * blk_crypto_update_capabilities() - Update the capabilities of a crypto
+ * profile to match those of another crypto
+ * profile.
+ * @dst: The crypto profile whose capabilities to update.
+ * @src: The crypto profile whose capabilities this function will update @dst's
+ * capabilities to.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This is turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @src must support all the crypto capabilities that
+ * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+ const struct blk_crypto_profile *src)
+{
+ memcpy(dst->modes_supported, src->modes_supported,
+ sizeof(dst->modes_supported));
+
+ dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 103c2e2d50d6..ec9efeeeca91 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -11,7 +11,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
blk_status_t __blk_crypto_init_request(struct request *rq)
{
- return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
- &rq->crypt_keyslot);
+ return blk_crypto_get_keyslot(rq->q->crypto_profile,
+ rq->crypt_ctx->bc_key,
+ &rq->crypt_keyslot);
}
/**
@@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/
void __blk_crypto_free_request(struct request *rq)
{
- blk_ksm_put_slot(rq->crypt_keyslot);
+ blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq);
}
@@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
+ struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
- if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
- &bc_key->crypto_cfg))
+ profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
+ if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true;
if (blk_crypto_fallback_bio_prep(bio_ptr))
@@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+ __blk_crypto_cfg_supported(q->crypto_profile, cfg);
}
/**
@@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
{
- if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called.
*
- * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key)
{
- if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
- return blk_ksm_evict_key(q->ksm, key);
+ if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+ return __blk_crypto_evict_key(q->crypto_profile, key);
/*
- * If the request queue's associated inline encryption hardware didn't
- * have support for the key, then the key might have been programmed
- * into the fallback keyslot manager, so try to evict from there.
+ * If the request_queue didn't support the key, then blk-crypto-fallback
+ * may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
}
diff --git a/block/blk-exec.c b/block/blk-exec.c
index d6cd501c0d34..1b8b47f6e79b 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
static bool blk_rq_is_poll(struct request *rq)
{
- return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
+ if (!rq->mq_hctx)
+ return false;
+ if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+ return false;
+ if (WARN_ON_ONCE(!rq->bio))
+ return false;
+ return true;
}
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
do {
- blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true);
+ bio_poll(rq->bio, NULL, 0);
cond_resched();
} while (!completion_done(wait));
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 4201728bf3a5..8e364bda5166 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*/
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
@@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq)
*/
if (!policy) {
blk_mq_end_request(rq, 0);
- return;
+ return true;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
@@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq)
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- blk_mq_request_bypass_insert(rq, false, false);
- return;
- }
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
+ return false;
/*
* @rq should go through flush machinery. Mark it part of flush
@@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq)
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
+
+ return true;
}
/**
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
new file mode 100644
index 000000000000..c246c425d0d7
--- /dev/null
+++ b/block/blk-ia-ranges.c
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Block device concurrent positioning ranges.
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its Affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include "blk.h"
+
+static ssize_t
+blk_ia_range_sector_show(struct blk_independent_access_range *iar,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n", iar->sector);
+}
+
+static ssize_t
+blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n", iar->nr_sectors);
+}
+
+struct blk_ia_range_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
+ .attr = { .name = "sector", .mode = 0444 },
+ .show = blk_ia_range_sector_show,
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
+ .attr = { .name = "nr_sectors", .mode = 0444 },
+ .show = blk_ia_range_nr_sectors_show,
+};
+
+static struct attribute *blk_ia_range_attrs[] = {
+ &blk_ia_range_sector_entry.attr,
+ &blk_ia_range_nr_sectors_entry.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(blk_ia_range);
+
+static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct blk_ia_range_sysfs_entry *entry =
+ container_of(attr, struct blk_ia_range_sysfs_entry, attr);
+ struct blk_independent_access_range *iar =
+ container_of(kobj, struct blk_independent_access_range, kobj);
+ ssize_t ret;
+
+ mutex_lock(&iar->queue->sysfs_lock);
+ ret = entry->show(iar, buf);
+ mutex_unlock(&iar->queue->sysfs_lock);
+
+ return ret;
+}
+
+static const struct sysfs_ops blk_ia_range_sysfs_ops = {
+ .show = blk_ia_range_sysfs_show,
+};
+
+/*
+ * Independent access range entries are not freed individually, but alltogether
+ * with struct blk_independent_access_ranges and its array of ranges. Since
+ * kobject_add() takes a reference on the parent kobject contained in
+ * struct blk_independent_access_ranges, the array of independent access range
+ * entries cannot be freed until kobject_del() is called for all entries.
+ * So we do not need to do anything here, but still need this no-op release
+ * operation to avoid complaints from the kobject code.
+ */
+static void blk_ia_range_sysfs_nop_release(struct kobject *kobj)
+{
+}
+
+static struct kobj_type blk_ia_range_ktype = {
+ .sysfs_ops = &blk_ia_range_sysfs_ops,
+ .default_groups = blk_ia_range_groups,
+ .release = blk_ia_range_sysfs_nop_release,
+};
+
+/*
+ * This will be executed only after all independent access range entries are
+ * removed with kobject_del(), at which point, it is safe to free everything,
+ * including the array of ranges.
+ */
+static void blk_ia_ranges_sysfs_release(struct kobject *kobj)
+{
+ struct blk_independent_access_ranges *iars =
+ container_of(kobj, struct blk_independent_access_ranges, kobj);
+
+ kfree(iars);
+}
+
+static struct kobj_type blk_ia_ranges_ktype = {
+ .release = blk_ia_ranges_sysfs_release,
+};
+
+/**
+ * disk_register_ia_ranges - register with sysfs a set of independent
+ * access ranges
+ * @disk: Target disk
+ * @new_iars: New set of independent access ranges
+ *
+ * Register with sysfs a set of independent access ranges for @disk.
+ * If @new_iars is not NULL, this set of ranges is registered and the old set
+ * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
+ * registered if it is not already.
+ */
+int disk_register_independent_access_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *new_iars)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_independent_access_ranges *iars;
+ int i, ret;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
+ /* If a new range set is specified, unregister the old one */
+ if (new_iars) {
+ if (q->ia_ranges)
+ disk_unregister_independent_access_ranges(disk);
+ q->ia_ranges = new_iars;
+ }
+
+ iars = q->ia_ranges;
+ if (!iars)
+ return 0;
+
+ /*
+ * At this point, iars is the new set of sector access ranges that needs
+ * to be registered with sysfs.
+ */
+ WARN_ON(iars->sysfs_registered);
+ ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
+ &q->kobj, "%s", "independent_access_ranges");
+ if (ret) {
+ q->ia_ranges = NULL;
+ kfree(iars);
+ return ret;
+ }
+
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ iars->ia_range[i].queue = q;
+ ret = kobject_init_and_add(&iars->ia_range[i].kobj,
+ &blk_ia_range_ktype, &iars->kobj,
+ "%d", i);
+ if (ret) {
+ while (--i >= 0)
+ kobject_del(&iars->ia_range[i].kobj);
+ kobject_del(&iars->kobj);
+ kobject_put(&iars->kobj);
+ return ret;
+ }
+ }
+
+ iars->sysfs_registered = true;
+
+ return 0;
+}
+
+void disk_unregister_independent_access_ranges(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_independent_access_ranges *iars = q->ia_ranges;
+ int i;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (!iars)
+ return;
+
+ if (iars->sysfs_registered) {
+ for (i = 0; i < iars->nr_ia_ranges; i++)
+ kobject_del(&iars->ia_range[i].kobj);
+ kobject_del(&iars->kobj);
+ kobject_put(&iars->kobj);
+ } else {
+ kfree(iars);
+ }
+
+ q->ia_ranges = NULL;
+}
+
+static struct blk_independent_access_range *
+disk_find_ia_range(struct blk_independent_access_ranges *iars,
+ sector_t sector)
+{
+ struct blk_independent_access_range *iar;
+ int i;
+
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ iar = &iars->ia_range[i];
+ if (sector >= iar->sector &&
+ sector < iar->sector + iar->nr_sectors)
+ return iar;
+ }
+
+ return NULL;
+}
+
+static bool disk_check_ia_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *iars)
+{
+ struct blk_independent_access_range *iar, *tmp;
+ sector_t capacity = get_capacity(disk);
+ sector_t sector = 0;
+ int i;
+
+ /*
+ * While sorting the ranges in increasing LBA order, check that the
+ * ranges do not overlap, that there are no sector holes and that all
+ * sectors belong to one range.
+ */
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ tmp = disk_find_ia_range(iars, sector);
+ if (!tmp || tmp->sector != sector) {
+ pr_warn("Invalid non-contiguous independent access ranges\n");
+ return false;
+ }
+
+ iar = &iars->ia_range[i];
+ if (tmp != iar) {
+ swap(iar->sector, tmp->sector);
+ swap(iar->nr_sectors, tmp->nr_sectors);
+ }
+
+ sector += iar->nr_sectors;
+ }
+
+ if (sector != capacity) {
+ pr_warn("Independent access ranges do not match disk capacity\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool disk_ia_ranges_changed(struct gendisk *disk,
+ struct blk_independent_access_ranges *new)
+{
+ struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+ int i;
+
+ if (!old)
+ return true;
+
+ if (old->nr_ia_ranges != new->nr_ia_ranges)
+ return true;
+
+ for (i = 0; i < old->nr_ia_ranges; i++) {
+ if (new->ia_range[i].sector != old->ia_range[i].sector ||
+ new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * disk_alloc_independent_access_ranges - Allocate an independent access ranges
+ * data structure
+ * @disk: target disk
+ * @nr_ia_ranges: Number of independent access ranges
+ *
+ * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges
+ * access range descriptors.
+ */
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges)
+{
+ struct blk_independent_access_ranges *iars;
+
+ iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges),
+ GFP_KERNEL, disk->queue->node);
+ if (iars)
+ iars->nr_ia_ranges = nr_ia_ranges;
+ return iars;
+}
+EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges);
+
+/**
+ * disk_set_independent_access_ranges - Set a disk independent access ranges
+ * @disk: target disk
+ * @iars: independent access ranges structure
+ *
+ * Set the independent access ranges information of the request queue
+ * of @disk to @iars. If @iars is NULL and the independent access ranges
+ * structure already set is cleared. If there are no differences between
+ * @iars and the independent access ranges structure already set, @iars
+ * is freed.
+ */
+void disk_set_independent_access_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *iars)
+{
+ struct request_queue *q = disk->queue;
+
+ if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
+ kfree(iars);
+ iars = NULL;
+ }
+
+ mutex_lock(&q->sysfs_dir_lock);
+ mutex_lock(&q->sysfs_lock);
+
+ if (iars) {
+ if (!disk_check_ia_ranges(disk, iars)) {
+ kfree(iars);
+ iars = NULL;
+ goto reg;
+ }
+
+ if (!disk_ia_ranges_changed(disk, iars)) {
+ kfree(iars);
+ goto unlock;
+ }
+ }
+
+ /*
+ * This may be called for a registered queue. E.g. during a device
+ * revalidation. If that is the case, we need to unregister the old
+ * set of independent access ranges and register the new set. If the
+ * queue is not registered, registration of the device request queue
+ * will register the independent access ranges, so only swap in the
+ * new set and free the old one.
+ */
+reg:
+ if (blk_queue_registered(q)) {
+ disk_register_independent_access_ranges(disk, iars);
+ } else {
+ swap(q->ia_ranges, iars);
+ kfree(iars);
+ }
+
+unlock:
+ mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
+}
+EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 16d5d5338392..d670d54e5f7a 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- if (disk->queue->ksm) {
+ if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- blk_ksm_unregister(disk->queue);
+ blk_crypto_unregister(disk->queue);
}
#endif
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index b3880e4ba22a..a5b37cc65b17 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3165,12 +3165,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev->bd_disk->queue);
+ ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) {
- ret = blk_iocost_init(bdev->bd_disk->queue);
+ ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
- ioc = q_to_ioc(bdev->bd_disk->queue);
+ ioc = q_to_ioc(bdev_get_queue(bdev));
}
spin_lock_irq(&ioc->lock);
@@ -3332,12 +3332,12 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev->bd_disk->queue);
+ ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) {
- ret = blk_iocost_init(bdev->bd_disk->queue);
+ ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
- ioc = q_to_ioc(bdev->bd_disk->queue);
+ ioc = q_to_ioc(bdev_get_queue(bdev));
}
spin_lock_irq(&ioc->lock);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c0545f9da549..6593c7123b97 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -74,6 +74,7 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk.h"
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7a5c81c02c80..df69f4bb7717 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -6,12 +6,45 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
+#include "blk-throttle.h"
+
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ int idx;
+
+ bio_get_first_bvec(bio, bv);
+ if (bv->bv_len == bio->bi_iter.bi_size)
+ return; /* this bio only has a single bvec */
+
+ bio_advance_iter(bio, &iter, iter.bi_size);
+
+ if (!iter.bi_bvec_done)
+ idx = iter.bi_idx - 1;
+ else /* in the middle of bvec */
+ idx = iter.bi_idx;
+
+ *bv = bio->bi_io_vec[idx];
+
+ /*
+ * iter.bi_bvec_done records actual length of the last bvec
+ * if this bio ends in the middle of one io vector
+ */
+ if (iter.bi_bvec_done)
+ bv->bv_len = iter.bi_bvec_done;
+}
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
@@ -285,13 +318,13 @@ split:
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
- bio_clear_hipri(bio);
-
+ bio_clear_polled(bio);
return bio_split(bio, sectors, GFP_NOIO, bs);
}
/**
* __blk_queue_split - split a bio and submit the second half
+ * @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
@@ -302,9 +335,9 @@ split:
* of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished.
*/
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs)
{
- struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
@@ -321,21 +354,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
nr_segs);
break;
default:
- /*
- * All drivers must accept single-segments bios that are <=
- * PAGE_SIZE. This is a quick and dirty check that relies on
- * the fact that bi_io_vec[0] is always valid if a bio has data.
- * The check might lead to occasional false negatives when bios
- * are cloned, but compared to the performance impact of cloned
- * bios themselves the loop below doesn't matter anyway.
- */
- if (!q->limits.chunk_sectors &&
- (*bio)->bi_vcnt == 1 &&
- ((*bio)->bi_io_vec[0].bv_len +
- (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
- *nr_segs = 1;
- break;
- }
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
}
@@ -365,9 +383,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*/
void blk_queue_split(struct bio **bio)
{
+ struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
unsigned int nr_segs;
- __blk_queue_split(bio, &nr_segs);
+ if (blk_may_split(q, *bio))
+ __blk_queue_split(q, bio, &nr_segs);
}
EXPORT_SYMBOL(blk_queue_split);
@@ -558,6 +578,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q);
}
+static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
+ sector_t offset)
+{
+ struct request_queue *q = rq->q;
+
+ if (blk_rq_is_passthrough(rq))
+ return q->limits.max_hw_sectors;
+
+ if (!q->limits.chunk_sectors ||
+ req_op(rq) == REQ_OP_DISCARD ||
+ req_op(rq) == REQ_OP_SECURE_ERASE)
+ return blk_queue_get_max_sectors(q, req_op(rq));
+
+ return min(blk_max_size_offset(q, offset, 0),
+ blk_queue_get_max_sectors(q, req_op(rq)));
+}
+
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
@@ -718,6 +755,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE;
}
+static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+{
+ if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
+ return true;
+ return false;
+}
+
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
@@ -1023,12 +1067,11 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
+ * @same_queue_rq: output value, will be true if there's an existing request
+ * from the passed in @q already in the plug list
*
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list. Returns %true if merge was successful,
+ * Determine whether @bio being queued on @q can be merged with the previous
+ * request on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
* Plugging coalesces IOs from the same issuer for the same purpose without
@@ -1041,36 +1084,26 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq)
+ unsigned int nr_segs, bool *same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
- struct list_head *plug_list;
plug = blk_mq_plug(q, bio);
- if (!plug)
+ if (!plug || rq_list_empty(plug->mq_list))
return false;
- plug_list = &plug->mq_list;
-
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
- if (rq->q == q && same_queue_rq) {
- /*
- * Only blk-mq multiple hardware queues case checks the
- * rq in the same queue, there should be only one such
- * rq in a queue
- **/
- *same_queue_rq = rq;
- }
-
- if (rq->q != q)
- continue;
-
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
+ /* check the previously added entry for a quick merge attempt */
+ rq = rq_list_peek(&plug->mq_list);
+ if (rq->q == q) {
+ /*
+ * Only blk-mq multiple hardware queues case checks the rq in
+ * the same queue, there should be only one such rq in a queue
+ */
+ *same_queue_rq = true;
}
-
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK)
+ return true;
return false;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3b38d15723de..0f8c60e9c719 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -287,7 +287,7 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP),
- CMD_FLAG_NAME(HIPRI),
+ CMD_FLAG_NAME(POLLED),
};
#undef CMD_FLAG_NAME
@@ -453,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n");
- sbitmap_queue_show(tags->bitmap_tags, m);
+ sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n");
- sbitmap_queue_show(tags->breserved_tags, m);
+ sbitmap_queue_show(&tags->breserved_tags, m);
}
}
@@ -488,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->tags)
- sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
+ sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
@@ -522,77 +522,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->sched_tags)
- sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
+ sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
return res;
}
-static int hctx_io_poll_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "considered=%lu\n", hctx->poll_considered);
- seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
- seq_printf(m, "success=%lu\n", hctx->poll_success);
- return 0;
-}
-
-static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
- return count;
-}
-
-static int hctx_dispatched_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
- int i;
-
- seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
- for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
- unsigned int d = 1U << (i - 1);
-
- seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
- }
-
- seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
- return 0;
-}
-
-static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
- int i;
-
- for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
- hctx->dispatched[i] = 0;
- return count;
-}
-
-static int hctx_queued_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->queued);
- return 0;
-}
-
-static ssize_t hctx_queued_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->queued = 0;
- return count;
-}
-
static int hctx_run_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
@@ -614,7 +550,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
- seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+ seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0;
}
@@ -663,57 +599,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
-static int ctx_dispatched_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
- return 0;
-}
-
-static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
- return count;
-}
-
-static int ctx_merged_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu\n", ctx->rq_merged);
- return 0;
-}
-
-static ssize_t ctx_merged_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_merged = 0;
- return count;
-}
-
-static int ctx_completed_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
- return 0;
-}
-
-static ssize_t ctx_completed_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
- return count;
-}
-
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
@@ -789,9 +674,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
- {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
- {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
- {"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
@@ -803,9 +685,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
- {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
- {"merged", 0600, ctx_merged_show, ctx_merged_write},
- {"completed", 0600, ctx_completed_show, ctx_completed_write},
{},
};
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0f006cabfd91..c62b966dfaba 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -57,10 +57,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
@@ -363,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
@@ -389,13 +387,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
* potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges.
*/
- if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
- ctx->rq_merged++;
+ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
ret = true;
- }
spin_unlock(&ctx->lock);
-
return ret;
}
@@ -515,83 +510,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter);
}
-static int blk_mq_sched_alloc_tags(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_idx)
{
- struct blk_mq_tag_set *set = q->tag_set;
- int ret;
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ hctx->sched_tags = q->sched_shared_tags;
+ return 0;
+ }
+
+ hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
+ q->nr_requests);
- hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
- set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
+ return 0;
+}
- ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
- if (ret) {
- blk_mq_free_rq_map(hctx->sched_tags, set->flags);
- hctx->sched_tags = NULL;
- }
-
- return ret;
+static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
+{
+ blk_mq_free_rq_map(queue->sched_shared_tags);
+ queue->sched_shared_tags = NULL;
}
/* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
- blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
+ if (!blk_mq_is_shared_tags(flags))
+ blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL;
}
}
+
+ if (blk_mq_is_shared_tags(flags))
+ blk_mq_exit_sched_shared_tags(q);
}
-static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
- int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
- struct blk_mq_hw_ctx *hctx;
- int ret, i;
/*
* Set initial depth at max so that we don't need to reallocate for
* updating nr_requests.
*/
- ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
- &queue->sched_breserved_tags,
- MAX_SCHED_RQ, set->reserved_tags,
- set->numa_node, alloc_policy);
- if (ret)
- return ret;
-
- queue_for_each_hw_ctx(queue, hctx, i) {
- hctx->sched_tags->bitmap_tags =
- &queue->sched_bitmap_tags;
- hctx->sched_tags->breserved_tags =
- &queue->sched_breserved_tags;
- }
+ queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
+ BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!queue->sched_shared_tags)
+ return -ENOMEM;
- sbitmap_queue_resize(&queue->sched_bitmap_tags,
- queue->nr_requests - set->reserved_tags);
+ blk_mq_tag_update_sched_shared_tags(queue);
return 0;
}
-static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
-{
- sbitmap_queue_free(&queue->sched_bitmap_tags);
- sbitmap_queue_free(&queue->sched_breserved_tags);
-}
-
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
+ unsigned int i, flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
- unsigned int i;
int ret;
if (!e) {
@@ -606,23 +589,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_MAX_RQ);
+ BLKDEV_DEFAULT_RQ);
- queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_tags(q, hctx, i);
+ if (blk_mq_is_shared_tags(flags)) {
+ ret = blk_mq_init_sched_shared_tags(q);
if (ret)
- goto err_free_tags;
+ return ret;
}
- if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
- ret = blk_mq_init_sched_shared_sbitmap(q);
+ queue_for_each_hw_ctx(q, hctx, i) {
+ ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret)
- goto err_free_tags;
+ goto err_free_map_and_rqs;
}
ret = e->ops.init_sched(q, e);
if (ret)
- goto err_free_sbitmap;
+ goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q);
@@ -631,7 +614,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
ret = e->ops.init_hctx(hctx, i);
if (ret) {
eq = q->elevator;
- blk_mq_sched_free_requests(q);
+ blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
@@ -642,12 +625,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
-err_free_sbitmap:
- if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
- blk_mq_exit_sched_shared_sbitmap(q);
-err_free_tags:
- blk_mq_sched_free_requests(q);
- blk_mq_sched_tags_teardown(q);
+err_free_map_and_rqs:
+ blk_mq_sched_free_rqs(q);
+ blk_mq_sched_tags_teardown(q, flags);
+
q->elevator = NULL;
return ret;
}
@@ -656,14 +637,20 @@ err_free_tags:
* called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests
*/
-void blk_mq_sched_free_requests(struct request_queue *q)
+void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags)
- blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ } else {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_tags)
+ blk_mq_free_rqs(q->tag_set,
+ hctx->sched_tags, i);
+ }
}
}
@@ -684,8 +671,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
- blk_mq_sched_tags_teardown(q);
- if (blk_mq_is_sbitmap_shared(flags))
- blk_mq_exit_sched_shared_sbitmap(q);
+ blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 5246ae040704..25d1034952b6 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -2,21 +2,22 @@
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H
+#include "elevator.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
-#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
void blk_mq_sched_assign_ioc(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async);
@@ -28,45 +29,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
-void blk_mq_sched_free_requests(struct request_queue *q);
+void blk_mq_sched_free_rqs(struct request_queue *q);
-static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs)
+static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
- if (blk_queue_nomerges(q) || !bio_mergeable(bio))
- return false;
+ if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ __blk_mq_sched_restart(hctx);
+}
- return __blk_mq_sched_bio_merge(q, bio, nr_segs);
+static inline bool bio_mergeable(struct bio *bio)
+{
+ return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.allow_merge)
- return e->type->ops.allow_merge(q, rq, bio);
+ if (rq->rq_flags & RQF_ELV) {
+ struct elevator_queue *e = q->elevator;
+ if (e->type->ops.allow_merge)
+ return e->type->ops.allow_merge(q, rq, bio);
+ }
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
- struct elevator_queue *e = rq->q->elevator;
+ if (rq->rq_flags & RQF_ELV) {
+ struct elevator_queue *e = rq->q->elevator;
- if (e && e->type->ops.completed_request)
- e->type->ops.completed_request(rq, now);
+ if (e->type->ops.completed_request)
+ e->type->ops.completed_request(rq, now);
+ }
}
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
+ if (rq->rq_flags & RQF_ELV) {
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
- if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
- e->type->ops.requeue_request(rq);
+ if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
+ e->type->ops.requeue_request(rq);
+ }
}
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ff5caeb82542..995336abee33 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -24,13 +24,12 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&set->active_queues_shared_sbitmap);
+ atomic_inc(&hctx->tags->active_queues);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
@@ -45,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
- sbitmap_queue_wake_all(tags->bitmap_tags);
+ sbitmap_queue_wake_all(&tags->bitmap_tags);
if (include_reserve)
- sbitmap_queue_wake_all(tags->breserved_tags);
+ sbitmap_queue_wake_all(&tags->breserved_tags);
}
/*
@@ -57,20 +56,20 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
- struct request_queue *q = hctx->queue;
- struct blk_mq_tag_set *set = q->tag_set;
- if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
- atomic_dec(&set->active_queues_shared_sbitmap);
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
- atomic_dec(&tags->active_queues);
}
+ atomic_dec(&tags->active_queues);
+
blk_mq_tag_wakeup_all(tags, false);
}
@@ -87,6 +86,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return __sbitmap_queue_get(bt);
}
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+ unsigned int *offset)
+{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct sbitmap_queue *bt = &tags->bitmap_tags;
+ unsigned long ret;
+
+ if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED ||
+ data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ return 0;
+ ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
+ *offset += tags->nr_reserved_tags;
+ return ret;
+}
+
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -101,10 +115,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG;
}
- bt = tags->breserved_tags;
+ bt = &tags->breserved_tags;
tag_offset = 0;
} else {
- bt = tags->bitmap_tags;
+ bt = &tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags;
}
@@ -150,9 +164,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
- bt = tags->breserved_tags;
+ bt = &tags->breserved_tags;
else
- bt = tags->bitmap_tags;
+ bt = &tags->bitmap_tags;
/*
* If destination hw queue is changed, fake wake up on
@@ -186,13 +200,19 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
- sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
+ sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
- sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
+ sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
+{
+ sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+ tag_array, nr_tags);
+}
+
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn;
@@ -340,9 +360,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
+ bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED);
- bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
+ bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
}
/**
@@ -379,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
- int i;
+ unsigned int flags = tagset->flags;
+ int i, nr_tags;
+
+ nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
- for (i = 0; i < tagset->nr_hw_queues; i++) {
+ for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
@@ -459,8 +482,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
- bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
+ bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+ bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
blk_queue_exit(q);
}
@@ -492,56 +515,10 @@ free_bitmap_tags:
return -ENOMEM;
}
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
- int node, int alloc_policy)
-{
- int ret;
-
- ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
- &tags->__breserved_tags,
- tags->nr_tags, tags->nr_reserved_tags,
- node, alloc_policy);
- if (ret)
- return ret;
-
- tags->bitmap_tags = &tags->__bitmap_tags;
- tags->breserved_tags = &tags->__breserved_tags;
-
- return 0;
-}
-
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
-{
- int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
- int i, ret;
-
- ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
- set->queue_depth, set->reserved_tags,
- set->numa_node, alloc_policy);
- if (ret)
- return ret;
-
- for (i = 0; i < set->nr_hw_queues; i++) {
- struct blk_mq_tags *tags = set->tags[i];
-
- tags->bitmap_tags = &set->__bitmap_tags;
- tags->breserved_tags = &set->__breserved_tags;
- }
-
- return 0;
-}
-
-void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
-{
- sbitmap_queue_free(&set->__bitmap_tags);
- sbitmap_queue_free(&set->__breserved_tags);
-}
-
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags,
- int node, unsigned int flags)
+ int node, int alloc_policy)
{
- int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
@@ -557,22 +534,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
- if (blk_mq_is_sbitmap_shared(flags))
- return tags;
-
- if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
+ if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
+ total_tags, reserved_tags, node,
+ alloc_policy) < 0) {
kfree(tags);
return NULL;
}
return tags;
}
-void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_tags(struct blk_mq_tags *tags)
{
- if (!blk_mq_is_sbitmap_shared(flags)) {
- sbitmap_queue_free(tags->bitmap_tags);
- sbitmap_queue_free(tags->breserved_tags);
- }
+ sbitmap_queue_free(&tags->bitmap_tags);
+ sbitmap_queue_free(&tags->breserved_tags);
kfree(tags);
}
@@ -592,7 +566,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
- bool ret;
if (!can_grow)
return -EINVAL;
@@ -604,34 +577,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
- tags->nr_reserved_tags, set->flags);
+ /*
+ * Only the sbitmap needs resizing since we allocated the max
+ * initially.
+ */
+ if (blk_mq_is_shared_tags(set->flags))
+ return 0;
+
+ new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new)
return -ENOMEM;
- ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
- if (ret) {
- blk_mq_free_rq_map(new, set->flags);
- return -ENOMEM;
- }
- blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
- blk_mq_free_rq_map(*tagsptr, set->flags);
+ blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
*tagsptr = new;
} else {
/*
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
- sbitmap_queue_resize(tags->bitmap_tags,
+ sbitmap_queue_resize(&tags->bitmap_tags,
tdepth - tags->nr_reserved_tags);
}
return 0;
}
-void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
+void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
+{
+ struct blk_mq_tags *tags = set->shared_tags;
+
+ sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
+}
+
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
- sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
+ sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
+ q->nr_requests - q->tag_set->reserved_tags);
}
/**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 8ed55af08427..df787b5a23bd 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -2,52 +2,30 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
-/*
- * Tag address space map.
- */
-struct blk_mq_tags {
- unsigned int nr_tags;
- unsigned int nr_reserved_tags;
-
- atomic_t active_queues;
-
- struct sbitmap_queue *bitmap_tags;
- struct sbitmap_queue *breserved_tags;
-
- struct sbitmap_queue __bitmap_tags;
- struct sbitmap_queue __breserved_tags;
-
- struct request **rqs;
- struct request **static_rqs;
- struct list_head page_list;
-
- /*
- * used to clear request reference in rqs[] before freeing one
- * request pool
- */
- spinlock_t lock;
-};
+struct blk_mq_alloc_data;
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags,
- int node, unsigned int flags);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
+ int node, int alloc_policy);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags,
unsigned int queue_depth,
unsigned int reserved,
int node, int alloc_policy);
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
-extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+ unsigned int *offset);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
-extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
+extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size);
+extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 652a31fc3bb3..07eb1412760b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -10,14 +10,15 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
+#include <linux/interrupt.h>
#include <linux/llist.h>
-#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
@@ -63,6 +64,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
return bucket;
}
+#define BLK_QC_T_SHIFT 16
+#define BLK_QC_T_INTERNAL (1U << 31)
+
+static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
+ blk_qc_t qc)
+{
+ return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+}
+
+static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
+ blk_qc_t qc)
+{
+ unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
+
+ if (qc & BLK_QC_T_INTERNAL)
+ return blk_mq_tag_to_rq(hctx->sched_tags, tag);
+ return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static inline blk_qc_t blk_rq_to_qc(struct request *rq)
+{
+ return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
+ (rq->tag != -1 ?
+ rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+}
+
/*
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
@@ -214,7 +241,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
- blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->queue_lock, flags);
+ if (!q->quiesce_depth++)
+ blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
@@ -255,10 +287,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
- blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ unsigned long flags;
+ bool run_queue = false;
+
+ spin_lock_irqsave(&q->queue_lock, flags);
+ if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+ ;
+ } else if (!--q->quiesce_depth) {
+ blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ run_queue = true;
+ }
+ spin_unlock_irqrestore(&q->queue_lock, flags);
/* dispatch requests which are inserted during quiescing */
- blk_mq_run_hw_queues(q, true);
+ if (run_queue)
+ blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
@@ -272,74 +315,67 @@ void blk_mq_wake_waiters(struct request_queue *q)
blk_mq_tag_wakeup_all(hctx->tags, true);
}
-/*
- * Only need start/end time stamping if we have iostat or
- * blk stats enabled, or using an IO scheduler.
- */
-static inline bool blk_mq_need_time_stamp(struct request *rq)
-{
- return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
-}
-
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, u64 alloc_time_ns)
+ struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
{
- struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct blk_mq_ctx *ctx = data->ctx;
+ struct blk_mq_hw_ctx *hctx = data->hctx;
+ struct request_queue *q = data->q;
struct request *rq = tags->static_rqs[tag];
- if (data->q->elevator) {
- rq->tag = BLK_MQ_NO_TAG;
- rq->internal_tag = tag;
- } else {
+ rq->q = q;
+ rq->mq_ctx = ctx;
+ rq->mq_hctx = hctx;
+ rq->cmd_flags = data->cmd_flags;
+
+ if (data->flags & BLK_MQ_REQ_PM)
+ data->rq_flags |= RQF_PM;
+ if (blk_queue_io_stat(q))
+ data->rq_flags |= RQF_IO_STAT;
+ rq->rq_flags = data->rq_flags;
+
+ if (!(data->rq_flags & RQF_ELV)) {
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
+ } else {
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = tag;
}
+ rq->timeout = 0;
- /* csd/requeue_work/fifo_time is initialized before use */
- rq->q = data->q;
- rq->mq_ctx = data->ctx;
- rq->mq_hctx = data->hctx;
- rq->rq_flags = 0;
- rq->cmd_flags = data->cmd_flags;
- if (data->flags & BLK_MQ_REQ_PM)
- rq->rq_flags |= RQF_PM;
- if (blk_queue_io_stat(data->q))
- rq->rq_flags |= RQF_IO_STAT;
- INIT_LIST_HEAD(&rq->queuelist);
- INIT_HLIST_NODE(&rq->hash);
- RB_CLEAR_NODE(&rq->rb_node);
+ if (blk_mq_need_time_stamp(rq))
+ rq->start_time_ns = ktime_get_ns();
+ else
+ rq->start_time_ns = 0;
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
- if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = ktime_get_ns();
- else
- rq->start_time_ns = 0;
rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
- blk_crypto_rq_set_defaults(rq);
- /* tag was already set */
- WRITE_ONCE(rq->deadline, 0);
-
- rq->timeout = 0;
-
rq->end_io = NULL;
rq->end_io_data = NULL;
- data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
+ blk_crypto_rq_set_defaults(rq);
+ INIT_LIST_HEAD(&rq->queuelist);
+ /* tag was already set */
+ WRITE_ONCE(rq->deadline, 0);
refcount_set(&rq->ref, 1);
- if (!op_is_flush(data->cmd_flags)) {
+ if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = data->q->elevator;
rq->elv.icq = NULL;
- if (e && e->type->ops.prepare_request) {
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+
+ if (!op_is_flush(data->cmd_flags) &&
+ e->type->ops.prepare_request) {
if (e->type->icq_cache)
blk_mq_sched_assign_ioc(rq);
@@ -348,15 +384,44 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
}
}
- data->hctx->queued++;
return rq;
}
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
+static inline struct request *
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
+ u64 alloc_time_ns)
+{
+ unsigned int tag, tag_offset;
+ struct blk_mq_tags *tags;
+ struct request *rq;
+ unsigned long tag_mask;
+ int i, nr = 0;
+
+ tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+ if (unlikely(!tag_mask))
+ return NULL;
+
+ tags = blk_mq_tags_from_data(data);
+ for (i = 0; tag_mask; i++) {
+ if (!(tag_mask & (1UL << i)))
+ continue;
+ prefetch(tags->static_rqs[tag]);
+ tag = tag_offset + i;
+ tag_mask &= ~(1UL << i);
+ rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+ rq_list_add(data->cached_rq, rq);
+ }
+ data->nr_tags -= nr;
+
+ return rq_list_pop(data->cached_rq);
+}
+
+static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
struct request_queue *q = data->q;
struct elevator_queue *e = q->elevator;
u64 alloc_time_ns = 0;
+ struct request *rq;
unsigned int tag;
/* alloc_time includes depth and tag waits */
@@ -386,6 +451,16 @@ retry:
blk_mq_tag_busy(data->hctx);
/*
+ * Try batched alloc if we want more than 1 tag.
+ */
+ if (data->nr_tags > 1) {
+ rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+ if (rq)
+ return rq;
+ data->nr_tags = 1;
+ }
+
+ /*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
* should have migrated us to an online CPU by now.
@@ -394,16 +469,18 @@ retry:
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
-
/*
- * Give up the CPU and sleep for a random short time to ensure
- * that thread using a realtime scheduling class are migrated
- * off the CPU, and thus off the hctx that is going away.
+ * Give up the CPU and sleep for a random short time to
+ * ensure that thread using a realtime scheduling class
+ * are migrated off the CPU, and thus off the hctx that
+ * is going away.
*/
msleep(3);
goto retry;
}
- return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+
+ return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
+ alloc_time_ns);
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -413,6 +490,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
.q = q,
.flags = flags,
.cmd_flags = op,
+ .rq_flags = q->elevator ? RQF_ELV : 0,
+ .nr_tags = 1,
};
struct request *rq;
int ret;
@@ -421,7 +500,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (ret)
return ERR_PTR(ret);
- rq = __blk_mq_alloc_request(&data);
+ rq = __blk_mq_alloc_requests(&data);
if (!rq)
goto out_queue_exit;
rq->__data_len = 0;
@@ -441,6 +520,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
.q = q,
.flags = flags,
.cmd_flags = op,
+ .rq_flags = q->elevator ? RQF_ELV : 0,
+ .nr_tags = 1,
};
u64 alloc_time_ns = 0;
unsigned int cpu;
@@ -485,7 +566,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
- return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
+ return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
+ alloc_time_ns);
out_queue_exit:
blk_queue_exit(q);
@@ -514,12 +596,12 @@ static void __blk_mq_free_request(struct request *rq)
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->rq_flags & RQF_ELVPRIV) {
- if (e && e->type->ops.finish_request)
+ struct elevator_queue *e = q->elevator;
+
+ if (e->type->ops.finish_request)
e->type->ops.finish_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
@@ -527,7 +609,6 @@ void blk_mq_free_request(struct request *rq)
}
}
- ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
__blk_mq_dec_active_requests(hctx);
@@ -542,21 +623,173 @@ void blk_mq_free_request(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
- u64 now = 0;
+ struct request *rq;
- if (blk_mq_need_time_stamp(rq))
- now = ktime_get_ns();
+ while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) {
+ percpu_ref_get(&rq->q->q_usage_counter);
+ blk_mq_free_request(rq);
+ }
+}
+static void req_bio_endio(struct request *rq, struct bio *bio,
+ unsigned int nbytes, blk_status_t error)
+{
+ if (unlikely(error)) {
+ bio->bi_status = error;
+ } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size != nbytes)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
+ bio_advance(bio, nbytes);
+
+ if (unlikely(rq->rq_flags & RQF_QUIET))
+ bio_set_flag(bio, BIO_QUIET);
+ /* don't actually finish bio if it's part of flush sequence */
+ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+ bio_endio(bio);
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+ if (req->part && blk_do_io_stat(req)) {
+ const int sgrp = op_stat_group(req_op(req));
+
+ part_stat_lock();
+ part_stat_add(req->part, sectors[sgrp], bytes >> 9);
+ part_stat_unlock();
+ }
+}
+
+/**
+ * blk_update_request - Complete multiple bytes without completing the request
+ * @req: the request being processed
+ * @error: block status code
+ * @nr_bytes: number of bytes to complete for @req
+ *
+ * Description:
+ * Ends I/O on a number of bytes attached to @req, but doesn't complete
+ * the request structure even if @req doesn't have leftover.
+ * If @req has leftover, sets it up for the next range of segments.
+ *
+ * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ * %false return from this function.
+ *
+ * Note:
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ * except in the consistency check at the end of this function.
+ *
+ * Return:
+ * %false - this request doesn't have any more data
+ * %true - this request has more data
+ **/
+bool blk_update_request(struct request *req, blk_status_t error,
+ unsigned int nr_bytes)
+{
+ int total_bytes;
+
+ trace_block_rq_complete(req, error, nr_bytes);
+
+ if (!req->bio)
+ return false;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ error == BLK_STS_OK)
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
+ if (unlikely(error && !blk_rq_is_passthrough(req) &&
+ !(req->rq_flags & RQF_QUIET)))
+ blk_print_req_error(req, error);
+
+ blk_account_io_completion(req, nr_bytes);
+
+ total_bytes = 0;
+ while (req->bio) {
+ struct bio *bio = req->bio;
+ unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
+
+ if (bio_bytes == bio->bi_iter.bi_size)
+ req->bio = bio->bi_next;
+
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ req_bio_endio(req, bio, bio_bytes, error);
+
+ total_bytes += bio_bytes;
+ nr_bytes -= bio_bytes;
+
+ if (!nr_bytes)
+ break;
+ }
+
+ /*
+ * completely done
+ */
+ if (!req->bio) {
+ /*
+ * Reset counters so that the request stacking driver
+ * can find how many bytes remain in the request
+ * later.
+ */
+ req->__data_len = 0;
+ return false;
+ }
+
+ req->__data_len -= total_bytes;
+
+ /* update sector only for requests with clear definition of sector */
+ if (!blk_rq_is_passthrough(req))
+ req->__sector += total_bytes >> 9;
+
+ /* mixed attributes always follow the first bio */
+ if (req->rq_flags & RQF_MIXED_MERGE) {
+ req->cmd_flags &= ~REQ_FAILFAST_MASK;
+ req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
+ }
+
+ if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+ /*
+ * If total number of sectors is less than the first segment
+ * size, something has gone terribly wrong.
+ */
+ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+ blk_dump_rq_flags(req, "request botched");
+ req->__data_len = blk_rq_cur_bytes(req);
+ }
+
+ /* recalculate the number of segments */
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
+static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
+{
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq, now);
}
blk_mq_sched_completed_request(rq, now);
-
blk_account_io_done(rq, now);
+}
+
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+ if (blk_mq_need_time_stamp(rq))
+ __blk_mq_end_request_acct(rq, ktime_get_ns());
if (rq->end_io) {
rq_qos_done(rq->q, rq);
@@ -575,6 +808,57 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
}
EXPORT_SYMBOL(blk_mq_end_request);
+#define TAG_COMP_BATCH 32
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+ int *tag_array, int nr_tags)
+{
+ struct request_queue *q = hctx->queue;
+
+ blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
+ percpu_ref_put_many(&q->q_usage_counter, nr_tags);
+}
+
+void blk_mq_end_request_batch(struct io_comp_batch *iob)
+{
+ int tags[TAG_COMP_BATCH], nr_tags = 0;
+ struct blk_mq_hw_ctx *cur_hctx = NULL;
+ struct request *rq;
+ u64 now = 0;
+
+ if (iob->need_ts)
+ now = ktime_get_ns();
+
+ while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
+ prefetch(rq->bio);
+ prefetch(rq->rq_next);
+
+ blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
+ if (iob->need_ts)
+ __blk_mq_end_request_acct(rq, now);
+
+ WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+ if (!refcount_dec_and_test(&rq->ref))
+ continue;
+
+ blk_crypto_free_request(rq);
+ blk_pm_mark_last_busy(rq);
+ rq_qos_done(rq->q, rq);
+
+ if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
+ if (cur_hctx)
+ blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+ nr_tags = 0;
+ cur_hctx = rq->mq_hctx;
+ }
+ tags[nr_tags++] = rq->tag;
+ }
+
+ if (nr_tags)
+ blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
+
static void blk_complete_reqs(struct llist_head *list)
{
struct llist_node *entry = llist_reverse_order(llist_del_all(list));
@@ -658,7 +942,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
- if (rq->cmd_flags & REQ_HIPRI)
+ if (rq->cmd_flags & REQ_POLLED)
return false;
if (blk_mq_complete_need_ipi(rq)) {
@@ -723,7 +1007,14 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- rq->io_start_time_ns = ktime_get_ns();
+ u64 start_time;
+#ifdef CONFIG_BLK_CGROUP
+ if (rq->bio)
+ start_time = bio_issue_time(&rq->bio->bi_issue);
+ else
+#endif
+ start_time = ktime_get_ns();
+ rq->io_start_time_ns = start_time;
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
@@ -738,6 +1029,8 @@ void blk_mq_start_request(struct request *rq)
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
#endif
+ if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
+ WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
}
EXPORT_SYMBOL(blk_mq_start_request);
@@ -763,7 +1056,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- BUG_ON(!list_empty(&rq->queuelist));
blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -844,17 +1136,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
-{
- if (tag < tags->nr_tags) {
- prefetch(tags->rqs[tag]);
- return tags->rqs[tag];
- }
-
- return NULL;
-}
-EXPORT_SYMBOL(blk_mq_tag_to_rq);
-
static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
{
@@ -1059,24 +1340,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
return data.rq;
}
-static inline unsigned int queued_to_index(unsigned int queued)
-{
- if (!queued)
- return 0;
-
- return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
-}
-
-static bool __blk_mq_get_driver_tag(struct request *rq)
+static bool __blk_mq_alloc_driver_tag(struct request *rq)
{
- struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
+ struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
int tag;
blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
- bt = rq->mq_hctx->tags->breserved_tags;
+ bt = &rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
} else {
if (!hctx_may_queue(rq->mq_hctx, bt))
@@ -1091,11 +1364,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
return true;
}
-bool blk_mq_get_driver_tag(struct request *rq)
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
return false;
if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
@@ -1119,7 +1390,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
struct sbitmap_queue *sbq;
list_del_init(&wait->entry);
- sbq = hctx->tags->bitmap_tags;
+ sbq = &hctx->tags->bitmap_tags;
atomic_dec(&sbq->ws_active);
}
spin_unlock(&hctx->dispatch_wait_lock);
@@ -1137,7 +1408,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
+ struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
@@ -1394,8 +1665,6 @@ out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
- hctx->dispatched[queued_to_index(queued)]++;
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -1899,54 +2168,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
spin_unlock(&ctx->lock);
}
-static int plug_rq_cmp(void *priv, const struct list_head *a,
- const struct list_head *b)
+static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
+ bool from_schedule)
{
- struct request *rqa = container_of(a, struct request, queuelist);
- struct request *rqb = container_of(b, struct request, queuelist);
+ if (hctx->queue->mq_ops->commit_rqs) {
+ trace_block_unplug(hctx->queue, *queued, !from_schedule);
+ hctx->queue->mq_ops->commit_rqs(hctx);
+ }
+ *queued = 0;
+}
- if (rqa->mq_ctx != rqb->mq_ctx)
- return rqa->mq_ctx > rqb->mq_ctx;
- if (rqa->mq_hctx != rqb->mq_hctx)
- return rqa->mq_hctx > rqb->mq_hctx;
+static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
+{
+ struct blk_mq_hw_ctx *hctx = NULL;
+ struct request *rq;
+ int queued = 0;
+ int errors = 0;
- return blk_rq_pos(rqa) > blk_rq_pos(rqb);
+ while ((rq = rq_list_pop(&plug->mq_list))) {
+ bool last = rq_list_empty(plug->mq_list);
+ blk_status_t ret;
+
+ if (hctx != rq->mq_hctx) {
+ if (hctx)
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
+ hctx = rq->mq_hctx;
+ }
+
+ ret = blk_mq_request_issue_directly(rq, last);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, false, last);
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
+ return;
+ default:
+ blk_mq_end_request(rq, ret);
+ errors++;
+ break;
+ }
+ }
+
+ /*
+ * If we didn't flush the entire list, we could have told the driver
+ * there was more coming, but that turned out to be a lie.
+ */
+ if (errors)
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
}
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
+ struct blk_mq_hw_ctx *this_hctx;
+ struct blk_mq_ctx *this_ctx;
+ unsigned int depth;
LIST_HEAD(list);
- if (list_empty(&plug->mq_list))
+ if (rq_list_empty(plug->mq_list))
return;
- list_splice_init(&plug->mq_list, &list);
-
- if (plug->rq_count > 2 && plug->multiple_queues)
- list_sort(NULL, &list, plug_rq_cmp);
-
plug->rq_count = 0;
+ if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+ blk_mq_plug_issue_direct(plug, from_schedule);
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
+
+ this_hctx = NULL;
+ this_ctx = NULL;
+ depth = 0;
do {
- struct list_head rq_list;
- struct request *rq, *head_rq = list_entry_rq(list.next);
- struct list_head *pos = &head_rq->queuelist; /* skip first */
- struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
- struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
- unsigned int depth = 1;
-
- list_for_each_continue(pos, &list) {
- rq = list_entry_rq(pos);
- BUG_ON(!rq->q);
- if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
- break;
- depth++;
+ struct request *rq;
+
+ rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ trace_block_unplug(this_hctx->queue, depth,
+ !from_schedule);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx,
+ &list, from_schedule);
+ depth = 0;
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+
}
- list_cut_before(&rq_list, &list, pos);
- trace_block_unplug(head_rq->q, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
+ list_add(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ if (!list_empty(&list)) {
+ trace_block_unplug(this_hctx->queue, depth, !from_schedule);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
from_schedule);
- } while(!list_empty(&list));
+ }
}
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
@@ -1969,19 +2290,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie, bool last)
+ struct request *rq, bool last)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
.last = last,
};
- blk_qc_t new_cookie;
blk_status_t ret;
- new_cookie = request_to_qc_t(hctx, rq);
-
/*
* For OK queue, we are done. For error, caller may kill it.
* Any other error (busy), just add it to our list as we
@@ -1991,7 +2308,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
switch (ret) {
case BLK_STS_OK:
blk_mq_update_dispatch_busy(hctx, false);
- *cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
@@ -2000,7 +2316,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
break;
default:
blk_mq_update_dispatch_busy(hctx, false);
- *cookie = BLK_QC_T_NONE;
break;
}
@@ -2009,7 +2324,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
- blk_qc_t *cookie,
bool bypass_insert, bool last)
{
struct request_queue *q = rq->q;
@@ -2029,7 +2343,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
goto insert;
}
- if (q->elevator && !bypass_insert)
+ if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
goto insert;
budget_token = blk_mq_get_dispatch_budget(q);
@@ -2043,7 +2357,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
goto insert;
}
- return __blk_mq_issue_directly(hctx, rq, cookie, last);
+ return __blk_mq_issue_directly(hctx, rq, last);
insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
@@ -2057,7 +2371,6 @@ insert:
* blk_mq_try_issue_directly - Try to send a request directly to device driver.
* @hctx: Pointer of the associated hardware queue.
* @rq: Pointer to request to be sent.
- * @cookie: Request queue cookie.
*
* If the device has enough resources to accept a new request now, send the
* request directly to device driver. Else, insert at hctx->dispatch queue, so
@@ -2065,7 +2378,7 @@ insert:
* queue have higher priority.
*/
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq, blk_qc_t *cookie)
+ struct request *rq)
{
blk_status_t ret;
int srcu_idx;
@@ -2074,7 +2387,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
+ ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
@@ -2087,11 +2400,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
blk_status_t ret;
int srcu_idx;
- blk_qc_t unused_cookie;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
+ ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
hctx_unlock(hctx, srcu_idx);
return ret;
@@ -2135,27 +2447,28 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
- list_add_tail(&rq->queuelist, &plug->mq_list);
- plug->rq_count++;
- if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
- struct request *tmp;
+ if (!plug->multiple_queues) {
+ struct request *nxt = rq_list_peek(&plug->mq_list);
- tmp = list_first_entry(&plug->mq_list, struct request,
- queuelist);
- if (tmp->q != rq->q)
+ if (nxt && nxt->q != rq->q)
plug->multiple_queues = true;
}
+ if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+ plug->has_elevator = true;
+ rq->rq_next = NULL;
+ rq_list_add(&plug->mq_list, rq);
+ plug->rq_count++;
}
/*
- * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
* queues. This is important for md arrays to benefit from merging
* requests.
*/
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
if (plug->multiple_queues)
- return BLK_MAX_REQUEST_COUNT * 4;
+ return BLK_MAX_REQUEST_COUNT * 2;
return BLK_MAX_REQUEST_COUNT;
}
@@ -2171,57 +2484,63 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
*
* It will not queue the request if there is an error with the bio, or at the
* request creation.
- *
- * Returns: Request queue cookie.
*/
-blk_qc_t blk_mq_submit_bio(struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
{
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = {
- .q = q,
- };
struct request *rq;
struct blk_plug *plug;
- struct request *same_queue_rq = NULL;
- unsigned int nr_segs;
- blk_qc_t cookie;
+ bool same_queue_rq = false;
+ unsigned int nr_segs = 1;
blk_status_t ret;
- bool hipri;
blk_queue_bounce(q, &bio);
- __blk_queue_split(&bio, &nr_segs);
+ if (blk_may_split(q, bio))
+ __blk_queue_split(q, &bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
- if (!is_flush_fua && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
- goto queue_exit;
-
- if (blk_mq_sched_bio_merge(q, bio, nr_segs))
- goto queue_exit;
+ if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
+ if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
+ goto queue_exit;
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+ goto queue_exit;
+ }
rq_qos_throttle(q, bio);
- hipri = bio->bi_opf & REQ_HIPRI;
-
- data.cmd_flags = bio->bi_opf;
- rq = __blk_mq_alloc_request(&data);
- if (unlikely(!rq)) {
- rq_qos_cleanup(q, bio);
- if (bio->bi_opf & REQ_NOWAIT)
- bio_wouldblock_error(bio);
- goto queue_exit;
+ plug = blk_mq_plug(q, bio);
+ if (plug && plug->cached_rq) {
+ rq = rq_list_pop(&plug->cached_rq);
+ INIT_LIST_HEAD(&rq->queuelist);
+ } else {
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .nr_tags = 1,
+ .cmd_flags = bio->bi_opf,
+ .rq_flags = q->elevator ? RQF_ELV : 0,
+ };
+
+ if (plug) {
+ data.nr_tags = plug->nr_ios;
+ plug->nr_ios = 1;
+ data.cached_rq = &plug->cached_rq;
+ }
+ rq = __blk_mq_alloc_requests(&data);
+ if (unlikely(!rq)) {
+ rq_qos_cleanup(q, bio);
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ goto queue_exit;
+ }
}
trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
- cookie = request_to_qc_t(data.hctx, rq);
-
blk_mq_bio_to_request(rq, bio, nr_segs);
ret = blk_crypto_init_request(rq);
@@ -2229,17 +2548,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
bio->bi_status = ret;
bio_endio(bio);
blk_mq_free_request(rq);
- return BLK_QC_T_NONE;
+ return;
}
- plug = blk_mq_plug(q, bio);
- if (unlikely(is_flush_fua)) {
- /* Bypass scheduler for flush requests */
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(data.hctx, true);
- } else if (plug && (q->nr_hw_queues == 1 ||
- blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
- q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
+ if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+ return;
+
+ if (plug && (q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
@@ -2250,22 +2567,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
- if (!request_count)
+ if (!request_count) {
trace_block_plug(q);
- else
- last = list_entry_rq(plug->mq_list.prev);
+ } else if (!blk_queue_nomerges(q)) {
+ last = rq_list_peek(&plug->mq_list);
+ if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE)
+ last = NULL;
+ }
- if (request_count >= blk_plug_max_rq_count(plug) || (last &&
- blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
- blk_flush_plug_list(plug, false);
+ if (request_count >= blk_plug_max_rq_count(plug) || last) {
+ blk_mq_flush_plug_list(plug, false);
trace_block_plug(q);
}
blk_add_rq_to_plug(plug, rq);
- } else if (q->elevator) {
+ } else if (rq->rq_flags & RQF_ELV) {
/* Insert the request at the IO scheduler queue */
blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
+ struct request *next_rq = NULL;
+
/*
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
@@ -2273,39 +2594,32 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
* The plug list might get flushed before this. If that happens,
* the plug list is empty, and same_queue_rq is invalid.
*/
- if (list_empty(&plug->mq_list))
- same_queue_rq = NULL;
if (same_queue_rq) {
- list_del_init(&same_queue_rq->queuelist);
+ next_rq = rq_list_pop(&plug->mq_list);
plug->rq_count--;
}
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
- if (same_queue_rq) {
- data.hctx = same_queue_rq->mq_hctx;
+ if (next_rq) {
trace_block_unplug(q, 1, true);
- blk_mq_try_issue_directly(data.hctx, same_queue_rq,
- &cookie);
+ blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq);
}
} else if ((q->nr_hw_queues > 1 && is_sync) ||
- !data.hctx->dispatch_busy) {
+ !rq->mq_hctx->dispatch_busy) {
/*
* There is no scheduler and we can try to send directly
* to the hardware.
*/
- blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+ blk_mq_try_issue_directly(rq->mq_hctx, rq);
} else {
/* Default case. */
blk_mq_sched_insert_request(rq, false, true, true);
}
- if (!hipri)
- return BLK_QC_T_NONE;
- return cookie;
+ return;
queue_exit:
blk_queue_exit(q);
- return BLK_QC_T_NONE;
}
static size_t order_to_size(unsigned int order)
@@ -2314,19 +2628,22 @@ static size_t order_to_size(unsigned int order)
}
/* called before freeing request pool in @tags */
-static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
- struct blk_mq_tags *tags, unsigned int hctx_idx)
+static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
+ struct blk_mq_tags *tags)
{
- struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
struct page *page;
unsigned long flags;
+ /* There is no need to clear a driver tags own mapping */
+ if (drv_tags == tags)
+ return;
+
list_for_each_entry(page, &tags->page_list, lru) {
unsigned long start = (unsigned long)page_address(page);
unsigned long end = start + order_to_size(page->private);
int i;
- for (i = 0; i < set->queue_depth; i++) {
+ for (i = 0; i < drv_tags->nr_tags; i++) {
struct request *rq = drv_tags->rqs[i];
unsigned long rq_addr = (unsigned long)rq;
@@ -2350,9 +2667,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
+ struct blk_mq_tags *drv_tags;
struct page *page;
- if (tags->rqs && set->ops->exit_request) {
+ if (blk_mq_is_shared_tags(set->flags))
+ drv_tags = set->shared_tags;
+ else
+ drv_tags = set->tags[hctx_idx];
+
+ if (tags->static_rqs && set->ops->exit_request) {
int i;
for (i = 0; i < tags->nr_tags; i++) {
@@ -2365,7 +2688,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
- blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+ blk_mq_clear_rq_mapping(drv_tags, tags);
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
@@ -2379,21 +2702,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
- blk_mq_free_tags(tags, flags);
+ blk_mq_free_tags(tags);
}
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx,
- unsigned int nr_tags,
- unsigned int reserved_tags,
- unsigned int flags)
+static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
int node;
@@ -2402,7 +2724,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
+ BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
@@ -2410,7 +2733,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->rqs) {
- blk_mq_free_tags(tags, flags);
+ blk_mq_free_tags(tags);
return NULL;
}
@@ -2419,7 +2742,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
node);
if (!tags->static_rqs) {
kfree(tags->rqs);
- blk_mq_free_tags(tags, flags);
+ blk_mq_free_tags(tags);
return NULL;
}
@@ -2441,8 +2764,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
return 0;
}
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx, unsigned int depth)
+static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
@@ -2853,37 +3177,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
- int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int depth)
{
- unsigned int flags = set->flags;
- int ret = 0;
+ struct blk_mq_tags *tags;
+ int ret;
- set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
- set->queue_depth, set->reserved_tags, flags);
- if (!set->tags[hctx_idx])
- return false;
+ tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
+ if (!tags)
+ return NULL;
- ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
- set->queue_depth);
- if (!ret)
- return true;
+ ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
+ if (ret) {
+ blk_mq_free_rq_map(tags);
+ return NULL;
+ }
- blk_mq_free_rq_map(set->tags[hctx_idx], flags);
- set->tags[hctx_idx] = NULL;
- return false;
+ return tags;
}
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ int hctx_idx)
{
- unsigned int flags = set->flags;
+ if (blk_mq_is_shared_tags(set->flags)) {
+ set->tags[hctx_idx] = set->shared_tags;
- if (set->tags && set->tags[hctx_idx]) {
- blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
- blk_mq_free_rq_map(set->tags[hctx_idx], flags);
- set->tags[hctx_idx] = NULL;
+ return true;
}
+
+ set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
+ set->queue_depth);
+
+ return set->tags[hctx_idx];
+}
+
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
+{
+ if (tags) {
+ blk_mq_free_rqs(set, tags, hctx_idx);
+ blk_mq_free_rq_map(tags);
+ }
+}
+
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ if (!blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
+
+ set->tags[hctx_idx] = NULL;
}
static void blk_mq_map_swqueue(struct request_queue *q)
@@ -2916,7 +3261,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx_idx = set->map[j].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
- !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+ !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
@@ -2963,8 +3308,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* fallback in case of a new remap fails
* allocation
*/
- if (i && set->tags[i])
- blk_mq_free_map_and_requests(set, i);
+ if (i)
+ __blk_mq_free_map_and_rqs(set, i);
hctx->tags = NULL;
continue;
@@ -3260,8 +3605,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
- if (hctx->tags)
- blk_mq_free_map_and_requests(set, j);
+ __blk_mq_free_map_and_rqs(set, j);
blk_mq_exit_hctx(q, set, hctx, j);
hctxs[j] = NULL;
}
@@ -3348,8 +3692,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
+ if (blk_mq_is_shared_tags(set->flags)) {
+ set->shared_tags = blk_mq_alloc_map_and_rqs(set,
+ BLK_MQ_NO_HCTX_IDX,
+ set->queue_depth);
+ if (!set->shared_tags)
+ return -ENOMEM;
+ }
+
for (i = 0; i < set->nr_hw_queues; i++) {
- if (!__blk_mq_alloc_map_and_request(set, i))
+ if (!__blk_mq_alloc_map_and_rqs(set, i))
goto out_unwind;
cond_resched();
}
@@ -3358,7 +3710,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
out_unwind:
while (--i >= 0)
- blk_mq_free_map_and_requests(set, i);
+ __blk_mq_free_map_and_rqs(set, i);
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ blk_mq_free_map_and_rqs(set, set->shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ }
return -ENOMEM;
}
@@ -3368,7 +3725,7 @@ out_unwind:
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
@@ -3534,27 +3891,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_mq_map;
- ret = blk_mq_alloc_map_and_requests(set);
+ ret = blk_mq_alloc_set_map_and_rqs(set);
if (ret)
goto out_free_mq_map;
- if (blk_mq_is_sbitmap_shared(set->flags)) {
- atomic_set(&set->active_queues_shared_sbitmap, 0);
-
- if (blk_mq_init_shared_sbitmap(set)) {
- ret = -ENOMEM;
- goto out_free_mq_rq_maps;
- }
- }
-
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
-out_free_mq_rq_maps:
- for (i = 0; i < set->nr_hw_queues; i++)
- blk_mq_free_map_and_requests(set, i);
out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map);
@@ -3587,10 +3932,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i, j;
for (i = 0; i < set->nr_hw_queues; i++)
- blk_mq_free_map_and_requests(set, i);
+ __blk_mq_free_map_and_rqs(set, i);
- if (blk_mq_is_sbitmap_shared(set->flags))
- blk_mq_exit_shared_sbitmap(set);
+ if (blk_mq_is_shared_tags(set->flags)) {
+ blk_mq_free_map_and_rqs(set, set->shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ }
for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map);
@@ -3625,20 +3972,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
* If we're using an MQ scheduler, just update the scheduler
* queue depth. This is similar to what the old code would do.
*/
- if (!hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
- false);
- if (!ret && blk_mq_is_sbitmap_shared(set->flags))
- blk_mq_tag_resize_shared_sbitmap(set, nr);
- } else {
+ if (hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
- nr, true);
- if (blk_mq_is_sbitmap_shared(set->flags)) {
- hctx->sched_tags->bitmap_tags =
- &q->sched_bitmap_tags;
- hctx->sched_tags->breserved_tags =
- &q->sched_breserved_tags;
- }
+ nr, true);
+ } else {
+ ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
+ false);
}
if (ret)
break;
@@ -3647,9 +3986,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
}
if (!ret) {
q->nr_requests = nr;
- if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
- sbitmap_queue_resize(&q->sched_bitmap_tags,
- nr - set->reserved_tags);
+ if (blk_mq_is_shared_tags(set->flags)) {
+ if (q->elevator)
+ blk_mq_tag_update_sched_shared_tags(q);
+ else
+ blk_mq_tag_resize_shared_tags(set, nr);
+ }
}
blk_mq_unquiesce_queue(q);
@@ -3868,15 +4210,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
return ret;
}
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
- struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
{
+ struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
+ struct request *rq = blk_qc_to_rq(hctx, qc);
struct hrtimer_sleeper hs;
enum hrtimer_mode mode;
unsigned int nsecs;
ktime_t kt;
- if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
+ /*
+ * If a request has completed on queue that uses an I/O scheduler, we
+ * won't get back a request from blk_qc_to_rq.
+ */
+ if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
return false;
/*
@@ -3918,92 +4265,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
__set_current_state(TASK_RUNNING);
destroy_hrtimer_on_stack(&hs.timer);
- return true;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
-{
- struct request *rq;
-
- if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
- return false;
-
- if (!blk_qc_t_is_internal(cookie))
- rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
- else {
- rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
- /*
- * With scheduling, if the request has completed, we'll
- * get a NULL return here, as we clear the sched tag when
- * that happens. The request still remains valid, like always,
- * so we should be safe with just the NULL check.
- */
- if (!rq)
- return false;
- }
-
- return blk_mq_poll_hybrid_sleep(q, rq);
-}
-
-/**
- * blk_poll - poll for IO completions
- * @q: the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- * Poll for completions on the passed in queue. Returns number of
- * completed entries found. If @spin is true, then blk_poll will continue
- * looping until at least one completion is found, unless the task is
- * otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned int state;
-
- if (!blk_qc_t_valid(cookie) ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- return 0;
-
- if (current->plug)
- blk_flush_plug_list(current->plug, false);
-
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
/*
- * If we sleep, have the caller restart the poll loop to reset
- * the state. Like for the other success return cases, the
- * caller is responsible for checking if the IO completed. If
- * the IO isn't complete, we'll get called again and will go
- * straight to the busy poll loop. If specified not to spin,
- * we also should not sleep.
+ * If we sleep, have the caller restart the poll loop to reset the
+ * state. Like for the other success return cases, the caller is
+ * responsible for checking if the IO completed. If the IO isn't
+ * complete, we'll get called again and will go straight to the busy
+ * poll loop.
*/
- if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
- return 1;
+ return true;
+}
- hctx->poll_considered++;
+static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
+ struct io_comp_batch *iob, unsigned int flags)
+{
+ struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
+ long state = get_current_state();
+ int ret;
- state = get_current_state();
do {
- int ret;
-
- hctx->poll_invoked++;
-
- ret = q->mq_ops->poll(hctx);
+ ret = q->mq_ops->poll(hctx, iob);
if (ret > 0) {
- hctx->poll_success++;
__set_current_state(TASK_RUNNING);
return ret;
}
if (signal_pending_state(state, current))
__set_current_state(TASK_RUNNING);
-
if (task_is_running(current))
return 1;
- if (ret < 0 || !spin)
+
+ if (ret < 0 || (flags & BLK_POLL_ONESHOT))
break;
cpu_relax();
} while (!need_resched());
@@ -4011,7 +4303,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
__set_current_state(TASK_RUNNING);
return 0;
}
-EXPORT_SYMBOL_GPL(blk_poll);
+
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+ unsigned int flags)
+{
+ if (!(flags & BLK_POLL_NOSLEEP) &&
+ q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
+ if (blk_mq_poll_hybrid(q, cookie))
+ return 1;
+ }
+ return blk_mq_poll_classic(q, cookie, iob, flags);
+}
unsigned int blk_mq_rq_cpu(struct request *rq)
{
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d08779f77a26..28859fc5faee 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -25,18 +25,14 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
- /* incremented at dispatch time */
- unsigned long rq_dispatched[2];
- unsigned long rq_merged;
-
- /* incremented at completion time */
- unsigned long ____cacheline_aligned_in_smp rq_completed[2];
-
struct request_queue *queue;
struct blk_mq_ctxs *ctxs;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
+void blk_mq_submit_bio(struct bio *bio);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+ unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
@@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx,
- unsigned int nr_tags,
- unsigned int reserved_tags,
- unsigned int flags);
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx, unsigned int depth);
-
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx, unsigned int depth);
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
/*
* Internal helpers for request insertion into sw queues
*/
@@ -109,9 +102,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
- * The caller ensure that if REQ_HIPRI, poll must be enabled.
+ * The caller ensure that if REQ_POLLED, poll must be enabled.
*/
- if (flags & REQ_HIPRI)
+ if (flags & REQ_POLLED)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
@@ -128,6 +121,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+void blk_mq_free_plug_rqs(struct blk_plug *plug);
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_release(struct request_queue *q);
@@ -154,23 +149,27 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
unsigned int cmd_flags;
+ unsigned int rq_flags;
+
+ /* allocate multiple requests/tags in one go */
+ unsigned int nr_tags;
+ struct request **cached_rq;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
-static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+static inline bool blk_mq_is_shared_tags(unsigned int flags)
{
return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
- if (data->q->elevator)
- return data->hctx->sched_tags;
-
- return data->hctx->tags;
+ if (!(data->rq_flags & RQF_ELV))
+ return data->hctx->tags;
+ return data->hctx->sched_tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@@ -220,24 +219,24 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
- if (blk_mq_is_sbitmap_shared(hctx->flags))
- atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
+ if (blk_mq_is_shared_tags(hctx->flags))
+ atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
else
atomic_inc(&hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
- if (blk_mq_is_sbitmap_shared(hctx->flags))
- atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
+ if (blk_mq_is_shared_tags(hctx->flags))
+ atomic_dec(&hctx->queue->nr_active_requests_shared_tags);
else
atomic_dec(&hctx->nr_active);
}
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
- if (blk_mq_is_sbitmap_shared(hctx->flags))
- return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
+ if (blk_mq_is_shared_tags(hctx->flags))
+ return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
return atomic_read(&hctx->nr_active);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -260,7 +259,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
-bool blk_mq_get_driver_tag(struct request *rq);
+bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (rq->tag != BLK_MQ_NO_TAG &&
+ !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ hctx->tags->rqs[rq->tag] = rq;
+ return true;
+ }
+
+ return __blk_mq_get_driver_tag(hctx, rq);
+}
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
@@ -331,19 +343,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (bt->sb.depth == 1)
return true;
- if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;
- users = atomic_read(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
- users = atomic_read(&hctx->tags->active_queues);
}
+ users = atomic_read(&hctx->tags->active_queues);
+
if (!users)
return true;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index f000f83e0621..3cfbc8668cba 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
* BIO_TRACKED lets controllers know that a bio went through the
* normal rq_qos path.
*/
- bio_set_flag(bio, BIO_TRACKED);
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_TRACKED);
__rq_qos_throttle(q->rq_qos, bio);
+ }
}
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 614d9d47de36..cef1f713370b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -17,6 +17,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-wbt.h"
+#include "blk-throttle.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -432,26 +433,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count)
{
- unsigned long poll_on;
- ssize_t ret;
-
- if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
- !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return -EINVAL;
-
- ret = queue_var_store(&poll_on, page, count);
- if (ret < 0)
- return ret;
-
- if (poll_on) {
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- } else {
- blk_mq_freeze_queue(q);
- blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
- blk_mq_unfreeze_queue(q);
- }
-
- return ret;
+ pr_info_ratelimited("writes to the poll attribute are ignored.\n");
+ pr_info_ratelimited("please use driver specific parameters instead.\n");
+ return count;
}
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@@ -887,16 +873,15 @@ int blk_register_queue(struct gendisk *disk)
}
mutex_lock(&q->sysfs_lock);
+
+ ret = disk_register_independent_access_ranges(disk, NULL);
+ if (ret)
+ goto put_dev;
+
if (q->elevator) {
ret = elv_register_queue(q, false);
- if (ret) {
- mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
- kobject_del(&q->kobj);
- blk_trace_remove_sysfs(dev);
- kobject_put(&dev->kobj);
- return ret;
- }
+ if (ret)
+ goto put_dev;
}
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -928,6 +913,16 @@ unlock:
}
return ret;
+
+put_dev:
+ disk_unregister_independent_access_ranges(disk);
+ mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
+ kobject_del(&q->kobj);
+ blk_trace_remove_sysfs(dev);
+ kobject_put(&dev->kobj);
+
+ return ret;
}
/**
@@ -972,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_lock(&q->sysfs_lock);
if (q->elevator)
elv_unregister_queue(q);
+ disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7c4e7993ba97..39bb6e68a9a2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -13,6 +13,7 @@
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
+#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8
@@ -37,60 +38,9 @@
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
-static struct blkcg_policy blkcg_policy_throtl;
-
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-/*
- * To implement hierarchical throttling, throtl_grps form a tree and bios
- * are dispatched upwards level by level until they reach the top and get
- * issued. When dispatching bios from the children and local group at each
- * level, if the bios are dispatched into a single bio_list, there's a risk
- * of a local or child group which can queue many bios at once filling up
- * the list starving others.
- *
- * To avoid such starvation, dispatched bios are queued separately
- * according to where they came from. When they are again dispatched to
- * the parent, they're popped in round-robin order so that no single source
- * hogs the dispatch window.
- *
- * throtl_qnode is used to keep the queued bios separated by their sources.
- * Bios are queued to throtl_qnode which in turn is queued to
- * throtl_service_queue and then dispatched in round-robin order.
- *
- * It's also used to track the reference counts on blkg's. A qnode always
- * belongs to a throtl_grp and gets queued on itself or the parent, so
- * incrementing the reference of the associated throtl_grp when a qnode is
- * queued and decrementing when dequeued is enough to keep the whole blkg
- * tree pinned while bios are in flight.
- */
-struct throtl_qnode {
- struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
- struct throtl_grp *tg; /* tg this qnode belongs to */
-};
-
-struct throtl_service_queue {
- struct throtl_service_queue *parent_sq; /* the parent service_queue */
-
- /*
- * Bios queued directly to this service_queue or dispatched from
- * children throtl_grp's.
- */
- struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
-
- /*
- * RB tree of active children throtl_grp's, which are sorted by
- * their ->disptime.
- */
- struct rb_root_cached pending_tree; /* RB tree of active tgs */
- unsigned int nr_pending; /* # queued in the tree */
- unsigned long first_pending_disptime; /* disptime of the first tg */
- struct timer_list pending_timer; /* fires on first_pending_disptime */
-};
-
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
@@ -98,93 +48,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-enum {
- LIMIT_LOW,
- LIMIT_MAX,
- LIMIT_CNT,
-};
-
-struct throtl_grp {
- /* must be the first member */
- struct blkg_policy_data pd;
-
- /* active throtl group service_queue member */
- struct rb_node rb_node;
-
- /* throtl_data this group belongs to */
- struct throtl_data *td;
-
- /* this group's service queue */
- struct throtl_service_queue service_queue;
-
- /*
- * qnode_on_self is used when bios are directly queued to this
- * throtl_grp so that local bios compete fairly with bios
- * dispatched from children. qnode_on_parent is used when bios are
- * dispatched from this throtl_grp into its parent and will compete
- * with the sibling qnode_on_parents and the parent's
- * qnode_on_self.
- */
- struct throtl_qnode qnode_on_self[2];
- struct throtl_qnode qnode_on_parent[2];
-
- /*
- * Dispatch time in jiffies. This is the estimated time when group
- * will unthrottle and is ready to dispatch more bio. It is used as
- * key to sort active groups in service tree.
- */
- unsigned long disptime;
-
- unsigned int flags;
-
- /* are there any throtl rules between this group and td? */
- bool has_rules[2];
-
- /* internally used bytes per second rate limits */
- uint64_t bps[2][LIMIT_CNT];
- /* user configured bps limits */
- uint64_t bps_conf[2][LIMIT_CNT];
-
- /* internally used IOPS limits */
- unsigned int iops[2][LIMIT_CNT];
- /* user configured IOPS limits */
- unsigned int iops_conf[2][LIMIT_CNT];
-
- /* Number of bytes dispatched in current slice */
- uint64_t bytes_disp[2];
- /* Number of bio's dispatched in current slice */
- unsigned int io_disp[2];
-
- unsigned long last_low_overflow_time[2];
-
- uint64_t last_bytes_disp[2];
- unsigned int last_io_disp[2];
-
- unsigned long last_check_time;
-
- unsigned long latency_target; /* us */
- unsigned long latency_target_conf; /* us */
- /* When did we start a new slice */
- unsigned long slice_start[2];
- unsigned long slice_end[2];
-
- unsigned long last_finish_time; /* ns / 1024 */
- unsigned long checked_last_finish_time; /* ns / 1024 */
- unsigned long avg_idletime; /* ns / 1024 */
- unsigned long idletime_threshold; /* us */
- unsigned long idletime_threshold_conf; /* us */
-
- unsigned int bio_cnt; /* total bios */
- unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
- unsigned long bio_cnt_reset_time;
-
- atomic_t io_split_cnt[2];
- atomic_t last_io_split_cnt[2];
-
- struct blkg_rwstat stat_bytes;
- struct blkg_rwstat stat_ios;
-};
-
/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9
@@ -231,16 +94,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
-static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
-}
-
-static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
-{
- return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
-}
-
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
@@ -1794,7 +1647,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
-static struct blkcg_policy blkcg_policy_throtl = {
+struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -2208,9 +2061,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
} while (parent);
}
-bool blk_throtl_bio(struct bio *bio)
+bool __blk_throtl_bio(struct bio *bio)
{
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -2221,19 +2074,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock();
- /* see throtl_charge_bio() */
- if (bio_flagged(bio, BIO_THROTTLED))
- goto out;
-
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size);
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
}
- if (!tg->has_rules[rw])
- goto out;
-
spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td);
@@ -2317,7 +2163,6 @@ again:
out_unlock:
spin_unlock_irq(&q->queue_lock);
-out:
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
new file mode 100644
index 000000000000..175f03abd9e4
--- /dev/null
+++ b/block/blk-throttle.h
@@ -0,0 +1,182 @@
+#ifndef BLK_THROTTLE_H
+#define BLK_THROTTLE_H
+
+#include "blk-cgroup-rwstat.h"
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued. When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from. When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's. A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+ struct list_head node; /* service_queue->queued[] */
+ struct bio_list bios; /* queued bios */
+ struct throtl_grp *tg; /* tg this qnode belongs to */
+};
+
+struct throtl_service_queue {
+ struct throtl_service_queue *parent_sq; /* the parent service_queue */
+
+ /*
+ * Bios queued directly to this service_queue or dispatched from
+ * children throtl_grp's.
+ */
+ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
+ unsigned int nr_queued[2]; /* number of queued bios */
+
+ /*
+ * RB tree of active children throtl_grp's, which are sorted by
+ * their ->disptime.
+ */
+ struct rb_root_cached pending_tree; /* RB tree of active tgs */
+ unsigned int nr_pending; /* # queued in the tree */
+ unsigned long first_pending_disptime; /* disptime of the first tg */
+ struct timer_list pending_timer; /* fires on first_pending_disptime */
+};
+
+enum {
+ LIMIT_LOW,
+ LIMIT_MAX,
+ LIMIT_CNT,
+};
+
+struct throtl_grp {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
+ /* active throtl group service_queue member */
+ struct rb_node rb_node;
+
+ /* throtl_data this group belongs to */
+ struct throtl_data *td;
+
+ /* this group's service queue */
+ struct throtl_service_queue service_queue;
+
+ /*
+ * qnode_on_self is used when bios are directly queued to this
+ * throtl_grp so that local bios compete fairly with bios
+ * dispatched from children. qnode_on_parent is used when bios are
+ * dispatched from this throtl_grp into its parent and will compete
+ * with the sibling qnode_on_parents and the parent's
+ * qnode_on_self.
+ */
+ struct throtl_qnode qnode_on_self[2];
+ struct throtl_qnode qnode_on_parent[2];
+
+ /*
+ * Dispatch time in jiffies. This is the estimated time when group
+ * will unthrottle and is ready to dispatch more bio. It is used as
+ * key to sort active groups in service tree.
+ */
+ unsigned long disptime;
+
+ unsigned int flags;
+
+ /* are there any throtl rules between this group and td? */
+ bool has_rules[2];
+
+ /* internally used bytes per second rate limits */
+ uint64_t bps[2][LIMIT_CNT];
+ /* user configured bps limits */
+ uint64_t bps_conf[2][LIMIT_CNT];
+
+ /* internally used IOPS limits */
+ unsigned int iops[2][LIMIT_CNT];
+ /* user configured IOPS limits */
+ unsigned int iops_conf[2][LIMIT_CNT];
+
+ /* Number of bytes dispatched in current slice */
+ uint64_t bytes_disp[2];
+ /* Number of bio's dispatched in current slice */
+ unsigned int io_disp[2];
+
+ unsigned long last_low_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ unsigned long last_check_time;
+
+ unsigned long latency_target; /* us */
+ unsigned long latency_target_conf; /* us */
+ /* When did we start a new slice */
+ unsigned long slice_start[2];
+ unsigned long slice_end[2];
+
+ unsigned long last_finish_time; /* ns / 1024 */
+ unsigned long checked_last_finish_time; /* ns / 1024 */
+ unsigned long avg_idletime; /* ns / 1024 */
+ unsigned long idletime_threshold; /* us */
+ unsigned long idletime_threshold_conf; /* us */
+
+ unsigned int bio_cnt; /* total bios */
+ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+ unsigned long bio_cnt_reset_time;
+
+ atomic_t io_split_cnt[2];
+ atomic_t last_io_split_cnt[2];
+
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
+};
+
+extern struct blkcg_policy blkcg_policy_throtl;
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+ return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+/*
+ * Internal throttling interface
+ */
+#ifndef CONFIG_BLK_DEV_THROTTLING
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+#else /* CONFIG_BLK_DEV_THROTTLING */
+int blk_throtl_init(struct request_queue *q);
+void blk_throtl_exit(struct request_queue *q);
+void blk_throtl_register_queue(struct request_queue *q);
+void blk_throtl_charge_bio_split(struct bio *bio);
+bool __blk_throtl_bio(struct bio *bio);
+static inline bool blk_throtl_bio(struct bio *bio)
+{
+ struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+
+ if (bio_flagged(bio, BIO_THROTTLED))
+ return false;
+ if (!tg->has_rules[bio_data_dir(bio)])
+ return false;
+
+ return __blk_throtl_bio(bio);
+}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 874c1c37bf0c..0c119be0e813 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
unsigned int inflight = wbt_inflight(rwb);
int status;
+ if (!rwb->rqos.q->disk)
+ return;
+
status = latency_exceeded(rwb, cb->stat);
trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
diff --git a/block/blk.h b/block/blk.h
index 6c3c00a8fe19..7afffd548daf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,8 @@
#include "blk-mq.h"
#include "blk-mq-sched.h"
+struct elevator_type;
+
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
@@ -94,6 +96,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset);
}
+static inline bool rq_mergeable(struct request *rq)
+{
+ if (blk_rq_is_passthrough(rq))
+ return false;
+
+ if (req_op(rq) == REQ_OP_FLUSH)
+ return false;
+
+ if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+ return false;
+
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ return false;
+
+ if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
+ return false;
+ if (rq->rq_flags & RQF_NOMERGE_FLAGS)
+ return false;
+
+ return true;
+}
+
+/*
+ * There are two different ways to handle DISCARD merges:
+ * 1) If max_discard_segments > 1, the driver treats every bio as a range and
+ * send the bios to controller together. The ranges don't need to be
+ * contiguous.
+ * 2) Otherwise, the request will be normal read/write requests. The ranges
+ * need to be contiguous.
+ */
+static inline bool blk_discard_mergable(struct request *req)
+{
+ if (req_op(req) == REQ_OP_DISCARD &&
+ queue_max_discard_segments(req->q) > 1)
+ return true;
+ return false;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -175,21 +215,28 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
+void blk_print_req_error(struct request *req, blk_status_t status);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq);
+ unsigned int nr_segs, bool *same_queue_rq);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs);
-void blk_account_io_start(struct request *req);
-void blk_account_io_done(struct request *req, u64 now);
+void __blk_account_io_start(struct request *req);
+void __blk_account_io_done(struct request *req, u64 now);
+
+/*
+ * Plug flush limits
+ */
+#define BLK_MAX_REQUEST_COUNT 32
+#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
/*
* Internal elevator interface
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
@@ -202,7 +249,7 @@ static inline void elevator_exit(struct request_queue *q,
{
lockdep_assert_held(&q->sysfs_lock);
- blk_mq_sched_free_requests(q);
+ blk_mq_sched_free_rqs(q);
__elevator_exit(q, e);
}
@@ -220,7 +267,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
+static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+{
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ return true; /* non-trivial splitting decisions */
+ default:
+ break;
+ }
+
+ /*
+ * All drivers must accept single-segments bios that are <= PAGE_SIZE.
+ * This is a quick and dirty check that relies on the fact that
+ * bi_io_vec[0] is always valid if a bio has data. The check might
+ * lead to occasional false negatives when bios are cloned, but compared
+ * to the performance impact of cloned bios themselves the loop below
+ * doesn't matter anyway.
+ */
+ return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+ bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+}
+
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -240,7 +312,25 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
+ return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk;
+}
+
+static inline void blk_account_io_done(struct request *req, u64 now)
+{
+ /*
+ * Account IO completion. flush_rq isn't accounted as a
+ * normal IO on queueing nor completion. Accounting the
+ * containing request is enough.
+ */
+ if (blk_do_io_stat(req) && req->part &&
+ !(req->rq_flags & RQF_FLUSH_SEQ))
+ __blk_account_io_done(req, now);
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+ if (blk_do_io_stat(req))
+ __blk_account_io_start(req);
}
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
@@ -285,22 +375,6 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-/*
- * Internal throttling interface
- */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_register_queue(struct request_queue *q);
-extern void blk_throtl_charge_bio_split(struct bio *bio);
-bool blk_throtl_bio(struct bio *bio);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
-static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -368,13 +442,20 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
-static inline void bio_clear_hipri(struct bio *bio)
+static inline void bio_clear_polled(struct bio *bio)
{
/* can't support alloc cache if we turn off polling */
bio_clear_flag(bio, BIO_PERCPU_CACHE);
- bio->bi_opf &= ~REQ_HIPRI;
+ bio->bi_opf &= ~REQ_POLLED;
}
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+
extern const struct address_space_operations def_blk_aops;
+int disk_register_independent_access_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *new_iars);
+void disk_unregister_independent_access_ranges(struct gendisk *disk);
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index 05fc7148489d..7af1a72835b9 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
diff --git a/block/elevator.c b/block/elevator.c
index ff45d8388f48..1f39f6e8ebb9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -26,7 +26,6 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -40,6 +39,7 @@
#include <trace/events/block.h>
+#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
@@ -637,7 +637,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
return NULL;
if (q->nr_hw_queues != 1 &&
- !blk_mq_is_sbitmap_shared(q->tag_set->flags))
+ !blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
return elevator_get(q, "mq-deadline", false);
diff --git a/include/linux/elevator.h b/block/elevator.h
index ef9ceead3db1..16cd8bdedb7e 100644
--- a/include/linux/elevator.h
+++ b/block/elevator.h
@@ -1,17 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_ELEVATOR_H
-#define _LINUX_ELEVATOR_H
+#ifndef _ELEVATOR_H
+#define _ELEVATOR_H
#include <linux/percpu.h>
#include <linux/hashtable.h>
-#ifdef CONFIG_BLOCK
-
struct io_cq;
struct elevator_type;
-#ifdef CONFIG_BLK_DEBUG_FS
struct blk_mq_debugfs_attr;
-#endif
/*
* Return values from elevator merger
@@ -162,20 +158,9 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define ELEVATOR_INSERT_FLUSH 5
#define ELEVATOR_INSERT_SORT_MERGE 6
-#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))
#define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
-/*
- * Elevator features.
- */
-
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
-/* Supports scheduling on multiple hardware queues */
-#define ELEVATOR_F_MQ_AWARE (1U << 1)
-
-#endif /* CONFIG_BLOCK */
-#endif
+#endif /* _ELEVATOR_H */
diff --git a/block/fops.c b/block/fops.c
index 1e970c247e0e..a2f492e50782 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -17,7 +17,7 @@
#include <linux/fs.h>
#include "blk.h"
-static struct inode *bdev_file_inode(struct file *file)
+static inline struct inode *bdev_file_inode(struct file *file)
{
return file->f_mapping->host;
}
@@ -54,14 +54,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages)
{
- struct file *file = iocb->ki_filp;
- struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+ struct block_device *bdev = iocb->ki_filp->private_data;
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- blk_qc_t qc;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -78,7 +76,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, vecs, nr_pages);
bio_set_dev(&bio, bdev);
- bio.bi_iter.bi_sector = pos >> 9;
+ bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
@@ -102,13 +100,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb);
- qc = submit_bio(&bio);
+ submit_bio(&bio);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio.bi_private))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc, true))
+ if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -126,6 +123,11 @@ out:
return ret;
}
+enum {
+ DIO_SHOULD_DIRTY = 1,
+ DIO_IS_SYNC = 2,
+};
+
struct blkdev_dio {
union {
struct kiocb *iocb;
@@ -133,35 +135,27 @@ struct blkdev_dio {
};
size_t size;
atomic_t ref;
- bool multi_bio : 1;
- bool should_dirty : 1;
- bool is_sync : 1;
- struct bio bio;
+ unsigned int flags;
+ struct bio bio ____cacheline_aligned_in_smp;
};
static struct bio_set blkdev_dio_pool;
-static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
-{
- struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
- struct request_queue *q = bdev_get_queue(bdev);
-
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
-}
-
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
+ bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
- if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
- if (!dio->is_sync) {
+ if (atomic_dec_and_test(&dio->ref)) {
+ if (!(dio->flags & DIO_IS_SYNC)) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
+ WRITE_ONCE(iocb->private, NULL);
+
if (likely(!dio->bio.bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
@@ -170,8 +164,7 @@ static void blkdev_bio_end_io(struct bio *bio)
}
dio->iocb->ki_complete(iocb, ret, 0);
- if (dio->multi_bio)
- bio_put(&dio->bio);
+ bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
@@ -191,16 +184,12 @@ static void blkdev_bio_end_io(struct bio *bio)
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = bdev_file_inode(file);
- struct block_device *bdev = I_BDEV(inode);
+ struct block_device *bdev = iocb->ki_filp->private_data;
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
- bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
- blk_qc_t qc = BLK_QC_T_NONE;
int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
@@ -210,28 +199,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
- dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync) {
+ atomic_set(&dio->ref, 1);
+ /*
+ * Grab an extra reference to ensure the dio structure which is embedded
+ * into the first bio stays around.
+ */
+ bio_get(bio);
+
+ is_sync = is_sync_kiocb(iocb);
+ if (is_sync) {
+ dio->flags = DIO_IS_SYNC;
dio->waiter = current;
- bio_get(bio);
} else {
+ dio->flags = 0;
dio->iocb = iocb;
}
dio->size = 0;
- dio->multi_bio = false;
- dio->should_dirty = is_read && iter_is_iovec(iter);
+ if (is_read && iter_is_iovec(iter))
+ dio->flags |= DIO_SHOULD_DIRTY;
- /*
- * Don't plug for HIPRI/polled IO, as those should go straight
- * to issue
- */
- if (!is_poll)
- blk_start_plug(&plug);
+ blk_start_plug(&plug);
for (;;) {
bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = pos >> 9;
+ bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
@@ -246,7 +238,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (is_read) {
bio->bi_opf = REQ_OP_READ;
- if (dio->should_dirty)
+ if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
} else {
bio->bi_opf = dio_bio_write_op(iocb);
@@ -260,40 +252,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) {
- bool polled = false;
-
- if (iocb->ki_flags & IOCB_HIPRI) {
- bio_set_polled(bio, iocb);
- polled = true;
- }
-
- qc = submit_bio(bio);
-
- if (polled)
- WRITE_ONCE(iocb->ki_cookie, qc);
+ submit_bio(bio);
break;
}
-
- if (!dio->multi_bio) {
- /*
- * AIO needs an extra reference to ensure the dio
- * structure which is embedded into the first bio
- * stays around.
- */
- if (!is_sync)
- bio_get(bio);
- dio->multi_bio = true;
- atomic_set(&dio->ref, 2);
- } else {
- atomic_inc(&dio->ref);
- }
-
+ atomic_inc(&dio->ref);
submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages);
}
- if (!is_poll)
- blk_finish_plug(&plug);
+ blk_finish_plug(&plug);
if (!is_sync)
return -EIOCBQUEUED;
@@ -302,10 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->waiter))
break;
-
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc, true))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -318,6 +282,94 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+ struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ if (likely(!bio->bi_status)) {
+ ret = dio->size;
+ iocb->ki_pos += ret;
+ } else {
+ ret = blk_status_to_errno(bio->bi_status);
+ }
+
+ iocb->ki_complete(iocb, ret, 0);
+
+ if (dio->flags & DIO_SHOULD_DIRTY) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
+ struct iov_iter *iter,
+ unsigned int nr_pages)
+{
+ struct block_device *bdev = iocb->ki_filp->private_data;
+ struct blkdev_dio *dio;
+ struct bio *bio;
+ loff_t pos = iocb->ki_pos;
+ int ret = 0;
+
+ if ((pos | iov_iter_alignment(iter)) &
+ (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ dio = container_of(bio, struct blkdev_dio, bio);
+ dio->flags = 0;
+ dio->iocb = iocb;
+ bio_set_dev(bio, bdev);
+ bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_write_hint = iocb->ki_hint;
+ bio->bi_end_io = blkdev_bio_end_io_async;
+ bio->bi_ioprio = iocb->ki_ioprio;
+
+ if (iov_iter_is_bvec(iter)) {
+ /*
+ * Users don't rely on the iterator being in any particular
+ * state for async I/O returning -EIOCBQUEUED, hence we can
+ * avoid expensive iov_iter_advance(). Bypass
+ * bio_iov_iter_get_pages() and set the bvec directly.
+ */
+ bio_iov_bvec_set(bio, iter);
+ } else {
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(ret)) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return ret;
+ }
+ }
+ dio->size = bio->bi_iter.bi_size;
+
+ if (iov_iter_rw(iter) == READ) {
+ bio->bi_opf = REQ_OP_READ;
+ if (iter_is_iovec(iter)) {
+ dio->flags |= DIO_SHOULD_DIRTY;
+ bio_set_pages_dirty(bio);
+ }
+ } else {
+ bio->bi_opf = dio_bio_write_op(iocb);
+ task_io_account_write(bio->bi_iter.bi_size);
+ }
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+ submit_bio(bio);
+ WRITE_ONCE(iocb->private, bio);
+ } else {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
+ submit_bio(bio);
+ }
+ return -EIOCBQUEUED;
+}
+
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
unsigned int nr_pages;
@@ -326,9 +378,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
+ if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (is_sync_kiocb(iocb))
+ return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ }
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
@@ -405,8 +459,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync)
{
- struct inode *bd_inode = bdev_file_inode(filp);
- struct block_device *bdev = I_BDEV(bd_inode);
+ struct block_device *bdev = filp->private_data;
int error;
error = file_write_and_wait_range(filp, start, end);
@@ -448,6 +501,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+
+ filp->private_data = bdev;
filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0;
@@ -455,29 +510,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
static int blkdev_close(struct inode *inode, struct file *filp)
{
- struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
+ struct block_device *bdev = filp->private_data;
blkdev_put(bdev, filp->f_mode);
return 0;
}
-static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- fmode_t mode = file->f_mode;
-
- /*
- * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
- * to updated it before every ioctl.
- */
- if (file->f_flags & O_NDELAY)
- mode |= FMODE_NDELAY;
- else
- mode &= ~FMODE_NDELAY;
-
- return blkdev_ioctl(bdev, mode, cmd, arg);
-}
-
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -487,14 +525,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
*/
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = iocb->ki_filp->private_data;
+ struct inode *bd_inode = bdev->bd_inode;
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
size_t shorted = 0;
ssize_t ret;
- if (bdev_read_only(I_BDEV(bd_inode)))
+ if (bdev_read_only(bdev))
return -EPERM;
if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@@ -526,24 +564,26 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct file *file = iocb->ki_filp;
- struct inode *bd_inode = bdev_file_inode(file);
- loff_t size = i_size_read(bd_inode);
+ struct block_device *bdev = iocb->ki_filp->private_data;
+ loff_t size = i_size_read(bdev->bd_inode);
loff_t pos = iocb->ki_pos;
size_t shorted = 0;
ssize_t ret;
- if (pos >= size)
- return 0;
-
- size -= pos;
- if (iov_iter_count(to) > size) {
- shorted = iov_iter_count(to) - size;
- iov_iter_truncate(to, size);
+ if (unlikely(pos + iov_iter_count(to) > size)) {
+ if (pos >= size)
+ return 0;
+ size -= pos;
+ if (iov_iter_count(to) > size) {
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
+ }
}
ret = generic_file_read_iter(iocb, to);
- iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+
+ if (unlikely(shorted))
+ iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret;
}
@@ -592,16 +632,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL,
+ BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL,
+ BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
+ error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, 0);
break;
default:
error = -EOPNOTSUPP;
@@ -618,10 +660,10 @@ const struct file_operations def_blk_fops = {
.llseek = blkdev_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .iopoll = blkdev_iopoll,
+ .iopoll = iocb_bio_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
- .unlocked_ioctl = block_ioctl,
+ .unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl,
#endif
diff --git a/block/genhd.c b/block/genhd.c
index ab12ae6e636e..2052aeffa39b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -19,6 +19,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
@@ -625,6 +626,26 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
+/**
+ * invalidate_disk - invalidate the disk
+ * @disk: the struct gendisk to invalidate
+ *
+ * A helper to invalidates the disk. It will clean the disk's associated
+ * buffer/page caches and reset its internal states so that the disk
+ * can be reused by the drivers.
+ *
+ * Context: can sleep
+ */
+void invalidate_disk(struct gendisk *disk)
+{
+ struct block_device *bdev = disk->part0;
+
+ invalidate_bdev(bdev);
+ bdev->bd_inode->i_mapping->wb_err = 0;
+ set_capacity(disk, 0);
+}
+EXPORT_SYMBOL(invalidate_disk);
+
/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr,
@@ -884,7 +905,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
- struct request_queue *q = bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
@@ -928,7 +949,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
- struct request_queue *q = bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2];
if (queue_is_mq(q))
@@ -1268,6 +1289,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk->bdi)
goto out_free_disk;
+ /* bdev_alloc() might need the queue, set before the first call */
+ disk->queue = q;
+
disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0)
goto out_free_bdi;
@@ -1283,7 +1307,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
inc_diskseq(disk);
- disk->queue = q;
q->disk = disk;
lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@@ -1388,12 +1411,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
}
EXPORT_SYMBOL(set_disk_ro);
-int bdev_read_only(struct block_device *bdev)
-{
- return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
-}
-EXPORT_SYMBOL(bdev_read_only);
-
void inc_diskseq(struct gendisk *disk)
{
disk->diskseq = atomic64_inc_return(&diskseq);
diff --git a/block/holder.c b/block/holder.c
index 9dc084182337..27cddce1b446 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/genhd.h>
+#include <linux/slab.h>
struct bd_holder_disk {
struct list_head list;
diff --git a/block/ioctl.c b/block/ioctl.c
index eb0491e90b9a..77b1b2453f39 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -538,12 +538,22 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
*
* New commands must be compatible and go into blkdev_common_ioctl
*/
-int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
- unsigned long arg)
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- int ret;
- loff_t size;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
void __user *argp = (void __user *)arg;
+ fmode_t mode = file->f_mode;
+ loff_t size;
+ int ret;
+
+ /*
+ * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+ * to updated it before every ioctl.
+ */
+ if (file->f_flags & O_NDELAY)
+ mode |= FMODE_NDELAY;
+ else
+ mode &= ~FMODE_NDELAY;
switch (cmd) {
/* These need separate implementations for the data structure */
@@ -588,7 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -ENOTTY;
return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
}
-EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
#ifdef CONFIG_COMPAT
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
deleted file mode 100644
index 2c4a55bea6ca..000000000000
--- a/block/keyslot-manager.c
+++ /dev/null
@@ -1,578 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2019 Google LLC
- */
-
-/**
- * DOC: The Keyslot Manager
- *
- * Many devices with inline encryption support have a limited number of "slots"
- * into which encryption contexts may be programmed, and requests can be tagged
- * with a slot number to specify the key to use for en/decryption.
- *
- * As the number of slots is limited, and programming keys is expensive on
- * many inline encryption hardware, we don't want to program the same key into
- * multiple slots - if multiple requests are using the same key, we want to
- * program just one slot with that key and use that slot for all requests.
- *
- * The keyslot manager manages these keyslots appropriately, and also acts as
- * an abstraction between the inline encryption hardware and the upper layers.
- *
- * Lower layer devices will set up a keyslot manager in their request queue
- * and tell it how to perform device specific operations like programming/
- * evicting keys from keyslots.
- *
- * Upper layers will call blk_ksm_get_slot_for_key() to program a
- * key into some slot in the inline encryption hardware.
- */
-
-#define pr_fmt(fmt) "blk-crypto: " fmt
-
-#include <linux/keyslot-manager.h>
-#include <linux/device.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/pm_runtime.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-
-struct blk_ksm_keyslot {
- atomic_t slot_refs;
- struct list_head idle_slot_node;
- struct hlist_node hash_node;
- const struct blk_crypto_key *key;
- struct blk_keyslot_manager *ksm;
-};
-
-static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
-{
- /*
- * Calling into the driver requires ksm->lock held and the device
- * resumed. But we must resume the device first, since that can acquire
- * and release ksm->lock via blk_ksm_reprogram_all_keys().
- */
- if (ksm->dev)
- pm_runtime_get_sync(ksm->dev);
- down_write(&ksm->lock);
-}
-
-static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
-{
- up_write(&ksm->lock);
- if (ksm->dev)
- pm_runtime_put_sync(ksm->dev);
-}
-
-static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm)
-{
- return ksm->num_slots == 0;
-}
-
-/**
- * blk_ksm_init() - Initialize a keyslot manager
- * @ksm: The keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Allocate memory for keyslots and initialize a keyslot manager. Called by
- * e.g. storage drivers to set up a keyslot manager in their request_queue.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots)
-{
- unsigned int slot;
- unsigned int i;
- unsigned int slot_hashtable_size;
-
- memset(ksm, 0, sizeof(*ksm));
-
- if (num_slots == 0)
- return -EINVAL;
-
- ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL);
- if (!ksm->slots)
- return -ENOMEM;
-
- ksm->num_slots = num_slots;
-
- init_rwsem(&ksm->lock);
-
- init_waitqueue_head(&ksm->idle_slots_wait_queue);
- INIT_LIST_HEAD(&ksm->idle_slots);
-
- for (slot = 0; slot < num_slots; slot++) {
- ksm->slots[slot].ksm = ksm;
- list_add_tail(&ksm->slots[slot].idle_slot_node,
- &ksm->idle_slots);
- }
-
- spin_lock_init(&ksm->idle_slots_lock);
-
- slot_hashtable_size = roundup_pow_of_two(num_slots);
- /*
- * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
- * buckets. This only makes a difference when there is only 1 keyslot.
- */
- if (slot_hashtable_size < 2)
- slot_hashtable_size = 2;
-
- ksm->log_slot_ht_size = ilog2(slot_hashtable_size);
- ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size,
- sizeof(ksm->slot_hashtable[0]),
- GFP_KERNEL);
- if (!ksm->slot_hashtable)
- goto err_destroy_ksm;
- for (i = 0; i < slot_hashtable_size; i++)
- INIT_HLIST_HEAD(&ksm->slot_hashtable[i]);
-
- return 0;
-
-err_destroy_ksm:
- blk_ksm_destroy(ksm);
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init);
-
-static void blk_ksm_destroy_callback(void *ksm)
-{
- blk_ksm_destroy(ksm);
-}
-
-/**
- * devm_blk_ksm_init() - Resource-managed blk_ksm_init()
- * @dev: The device which owns the blk_keyslot_manager.
- * @ksm: The blk_keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically
- * on driver detach.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
- unsigned int num_slots)
-{
- int err = blk_ksm_init(ksm, num_slots);
-
- if (err)
- return err;
-
- return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm);
-}
-EXPORT_SYMBOL_GPL(devm_blk_ksm_init);
-
-static inline struct hlist_head *
-blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)];
-}
-
-static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot)
-{
- struct blk_keyslot_manager *ksm = slot->ksm;
- unsigned long flags;
-
- spin_lock_irqsave(&ksm->idle_slots_lock, flags);
- list_del(&slot->idle_slot_node);
- spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_keyslot(
- struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key);
- struct blk_ksm_keyslot *slotp;
-
- hlist_for_each_entry(slotp, head, hash_node) {
- if (slotp->key == key)
- return slotp;
- }
- return NULL;
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot(
- struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- struct blk_ksm_keyslot *slot;
-
- slot = blk_ksm_find_keyslot(ksm, key);
- if (!slot)
- return NULL;
- if (atomic_inc_return(&slot->slot_refs) == 1) {
- /* Took first reference to this slot; remove it from LRU list */
- blk_ksm_remove_slot_from_lru_list(slot);
- }
- return slot;
-}
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot)
-{
- return slot - slot->ksm->slots;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx);
-
-/**
- * blk_ksm_get_slot_for_key() - Program a key into a keyslot.
- * @ksm: The keyslot manager to program the key into.
- * @key: Pointer to the key object to program, including the raw key, crypto
- * mode, and data unit size.
- * @slot_ptr: A pointer to return the pointer of the allocated keyslot.
- *
- * Get a keyslot that's been programmed with the specified key. If one already
- * exists, return it with incremented refcount. Otherwise, wait for a keyslot
- * to become idle and program it.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the
- * allocated keyslot), or some other blk_status_t otherwise (and
- * keyslot is set to NULL).
- */
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- struct blk_ksm_keyslot **slot_ptr)
-{
- struct blk_ksm_keyslot *slot;
- int slot_idx;
- int err;
-
- *slot_ptr = NULL;
-
- if (blk_ksm_is_passthrough(ksm))
- return BLK_STS_OK;
-
- down_read(&ksm->lock);
- slot = blk_ksm_find_and_grab_keyslot(ksm, key);
- up_read(&ksm->lock);
- if (slot)
- goto success;
-
- for (;;) {
- blk_ksm_hw_enter(ksm);
- slot = blk_ksm_find_and_grab_keyslot(ksm, key);
- if (slot) {
- blk_ksm_hw_exit(ksm);
- goto success;
- }
-
- /*
- * If we're here, that means there wasn't a slot that was
- * already programmed with the key. So try to program it.
- */
- if (!list_empty(&ksm->idle_slots))
- break;
-
- blk_ksm_hw_exit(ksm);
- wait_event(ksm->idle_slots_wait_queue,
- !list_empty(&ksm->idle_slots));
- }
-
- slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot,
- idle_slot_node);
- slot_idx = blk_ksm_get_slot_idx(slot);
-
- err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx);
- if (err) {
- wake_up(&ksm->idle_slots_wait_queue);
- blk_ksm_hw_exit(ksm);
- return errno_to_blk_status(err);
- }
-
- /* Move this slot to the hash list for the new key. */
- if (slot->key)
- hlist_del(&slot->hash_node);
- slot->key = key;
- hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key));
-
- atomic_set(&slot->slot_refs, 1);
-
- blk_ksm_remove_slot_from_lru_list(slot);
-
- blk_ksm_hw_exit(ksm);
-success:
- *slot_ptr = slot;
- return BLK_STS_OK;
-}
-
-/**
- * blk_ksm_put_slot() - Release a reference to a slot
- * @slot: The keyslot to release the reference of.
- *
- * Context: Any context.
- */
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot)
-{
- struct blk_keyslot_manager *ksm;
- unsigned long flags;
-
- if (!slot)
- return;
-
- ksm = slot->ksm;
-
- if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
- &ksm->idle_slots_lock, flags)) {
- list_add_tail(&slot->idle_slot_node, &ksm->idle_slots);
- spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
- wake_up(&ksm->idle_slots_wait_queue);
- }
-}
-
-/**
- * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is
- * supported by a ksm.
- * @ksm: The keyslot manager to check
- * @cfg: The crypto configuration to check for.
- *
- * Checks for crypto_mode/data unit size/dun bytes support.
- *
- * Return: Whether or not this ksm supports the specified crypto config.
- */
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_config *cfg)
-{
- if (!ksm)
- return false;
- if (!(ksm->crypto_modes_supported[cfg->crypto_mode] &
- cfg->data_unit_size))
- return false;
- if (ksm->max_dun_bytes_supported < cfg->dun_bytes)
- return false;
- return true;
-}
-
-/**
- * blk_ksm_evict_key() - Evict a key from the lower layer device.
- * @ksm: The keyslot manager to evict from
- * @key: The key to evict
- *
- * Find the keyslot that the specified key was programmed into, and evict that
- * slot from the lower layer device. The slot must not be in use by any
- * in-flight IO when this function is called.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
- * if the keyslot is still in use, or another -errno value on other
- * error.
- */
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- struct blk_ksm_keyslot *slot;
- int err = 0;
-
- if (blk_ksm_is_passthrough(ksm)) {
- if (ksm->ksm_ll_ops.keyslot_evict) {
- blk_ksm_hw_enter(ksm);
- err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1);
- blk_ksm_hw_exit(ksm);
- return err;
- }
- return 0;
- }
-
- blk_ksm_hw_enter(ksm);
- slot = blk_ksm_find_keyslot(ksm, key);
- if (!slot)
- goto out_unlock;
-
- if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
- err = -EBUSY;
- goto out_unlock;
- }
- err = ksm->ksm_ll_ops.keyslot_evict(ksm, key,
- blk_ksm_get_slot_idx(slot));
- if (err)
- goto out_unlock;
-
- hlist_del(&slot->hash_node);
- slot->key = NULL;
- err = 0;
-out_unlock:
- blk_ksm_hw_exit(ksm);
- return err;
-}
-
-/**
- * blk_ksm_reprogram_all_keys() - Re-program all keyslots.
- * @ksm: The keyslot manager
- *
- * Re-program all keyslots that are supposed to have a key programmed. This is
- * intended only for use by drivers for hardware that loses its keys on reset.
- *
- * Context: Process context. Takes and releases ksm->lock.
- */
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
-{
- unsigned int slot;
-
- if (blk_ksm_is_passthrough(ksm))
- return;
-
- /* This is for device initialization, so don't resume the device */
- down_write(&ksm->lock);
- for (slot = 0; slot < ksm->num_slots; slot++) {
- const struct blk_crypto_key *key = ksm->slots[slot].key;
- int err;
-
- if (!key)
- continue;
-
- err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot);
- WARN_ON(err);
- }
- up_write(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm)
-{
- if (!ksm)
- return;
- kvfree(ksm->slot_hashtable);
- kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots);
- memzero_explicit(ksm, sizeof(*ksm));
-}
-EXPORT_SYMBOL_GPL(blk_ksm_destroy);
-
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q)
-{
- if (blk_integrity_queue_supports_integrity(q)) {
- pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- return false;
- }
- q->ksm = ksm;
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_register);
-
-void blk_ksm_unregister(struct request_queue *q)
-{
- q->ksm = NULL;
-}
-
-/**
- * blk_ksm_intersect_modes() - restrict supported modes by child device
- * @parent: The keyslot manager for parent device
- * @child: The keyslot manager for child device, or NULL
- *
- * Clear any crypto mode support bits in @parent that aren't set in @child.
- * If @child is NULL, then all parent bits are cleared.
- *
- * Only use this when setting up the keyslot manager for a layered device,
- * before it's been exposed yet.
- */
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
- const struct blk_keyslot_manager *child)
-{
- if (child) {
- unsigned int i;
-
- parent->max_dun_bytes_supported =
- min(parent->max_dun_bytes_supported,
- child->max_dun_bytes_supported);
- for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported);
- i++) {
- parent->crypto_modes_supported[i] &=
- child->crypto_modes_supported[i];
- }
- } else {
- parent->max_dun_bytes_supported = 0;
- memset(parent->crypto_modes_supported, 0,
- sizeof(parent->crypto_modes_supported));
- }
-}
-EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes);
-
-/**
- * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes
- * and DUN bytes that another KSM supports. Here,
- * "superset" refers to the mathematical meaning of the
- * word - i.e. if two KSMs have the *same* capabilities,
- * they *are* considered supersets of each other.
- * @ksm_superset: The KSM that we want to verify is a superset
- * @ksm_subset: The KSM that we want to verify is a subset
- *
- * Return: True if @ksm_superset supports a superset of the crypto modes and DUN
- * bytes that @ksm_subset supports.
- */
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
- struct blk_keyslot_manager *ksm_subset)
-{
- int i;
-
- if (!ksm_subset)
- return true;
-
- if (!ksm_superset)
- return false;
-
- for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) {
- if (ksm_subset->crypto_modes_supported[i] &
- (~ksm_superset->crypto_modes_supported[i])) {
- return false;
- }
- }
-
- if (ksm_subset->max_dun_bytes_supported >
- ksm_superset->max_dun_bytes_supported) {
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
-
-/**
- * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
- * another KSM
- * @target_ksm: The KSM whose restrictions to update.
- * @reference_ksm: The KSM to whose restrictions this function will update
- * @target_ksm's restrictions to.
- *
- * Blk-crypto requires that crypto capabilities that were
- * advertised when a bio was created continue to be supported by the
- * device until that bio is ended. This is turn means that a device cannot
- * shrink its advertised crypto capabilities without any explicit
- * synchronization with upper layers. So if there's no such explicit
- * synchronization, @reference_ksm must support all the crypto capabilities that
- * @target_ksm does
- * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
- *
- * Note also that as long as the crypto capabilities are being expanded, the
- * order of updates becoming visible is not important because it's alright
- * for blk-crypto to see stale values - they only cause blk-crypto to
- * believe that a crypto capability isn't supported when it actually is (which
- * might result in blk-crypto-fallback being used if available, or the bio being
- * failed).
- */
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
- struct blk_keyslot_manager *reference_ksm)
-{
- memcpy(target_ksm->crypto_modes_supported,
- reference_ksm->crypto_modes_supported,
- sizeof(target_ksm->crypto_modes_supported));
-
- target_ksm->max_dun_bytes_supported =
- reference_ksm->max_dun_bytes_supported;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
-
-/**
- * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
- * @ksm: The keyslot manager to init
- *
- * Initialize a passthrough keyslot manager.
- * Called by e.g. storage drivers to set up a keyslot manager in their
- * request_queue, when the storage driver wants to manage its keys by itself.
- * This is useful for inline encryption hardware that doesn't have the concept
- * of keyslots, and for layered devices.
- */
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm)
-{
- memset(ksm, 0, sizeof(*ksm));
- init_rwsem(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index a0ffbabfac2c..fdd74a4df56f 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -9,12 +9,12 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>
#include <trace/events/block.h>
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -453,11 +453,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int shift = tags->bitmap_tags->sb.shift;
+ unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
- sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 7f3c3932b723..85d919bf60c7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -9,7 +9,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -20,6 +19,7 @@
#include <trace/events/block.h>
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -31,6 +31,11 @@
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
@@ -51,17 +56,16 @@ enum dd_prio {
enum { DD_PRIO_COUNT = 3 };
-/* I/O statistics per I/O priority. */
+/*
+ * I/O statistics per I/O priority. It is fine if these counters overflow.
+ * What matters is that these counters are at least as wide as
+ * log2(max_outstanding_requests).
+ */
struct io_stats_per_prio {
- local_t inserted;
- local_t merged;
- local_t dispatched;
- local_t completed;
-};
-
-/* I/O statistics for all I/O priorities (enum dd_prio). */
-struct io_stats {
- struct io_stats_per_prio stats[DD_PRIO_COUNT];
+ uint32_t inserted;
+ uint32_t merged;
+ uint32_t dispatched;
+ atomic_t completed;
};
/*
@@ -74,6 +78,7 @@ struct dd_per_prio {
struct list_head fifo_list[DD_DIR_COUNT];
/* Next request in FIFO order. Read, write or both are NULL. */
struct request *next_rq[DD_DIR_COUNT];
+ struct io_stats_per_prio stats;
};
struct deadline_data {
@@ -88,8 +93,6 @@ struct deadline_data {
unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */
- struct io_stats __percpu *stats;
-
/*
* settings that change how the i/o scheduler behaves
*/
@@ -98,38 +101,12 @@ struct deadline_data {
int writes_starved;
int front_merges;
u32 async_depth;
+ int prio_aging_expire;
spinlock_t lock;
spinlock_t zone_lock;
};
-/* Count one event of type 'event_type' and with I/O priority 'prio' */
-#define dd_count(dd, event_type, prio) do { \
- struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
- \
- BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
- BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
- local_inc(&io_stats->stats[(prio)].event_type); \
- put_cpu_ptr(io_stats); \
-} while (0)
-
-/*
- * Returns the total number of dd_count(dd, event_type, prio) calls across all
- * CPUs. No locking or barriers since it is fine if the returned sum is slightly
- * outdated.
- */
-#define dd_sum(dd, event_type, prio) ({ \
- unsigned int cpu; \
- u32 sum = 0; \
- \
- BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
- BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
- for_each_present_cpu(cpu) \
- sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
- stats[(prio)].event_type); \
- sum; \
-})
-
/* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = {
[IOPRIO_CLASS_NONE] = DD_BE_PRIO,
@@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
const u8 ioprio_class = dd_rq_ioclass(next);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
- dd_count(dd, merged, prio);
+ lockdep_assert_held(&dd->lock);
+
+ dd->per_prio[prio].stats.merged++;
/*
* if next expires before rq, assign its expire time to rq
@@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
deadline_remove_request(rq->q, per_prio, rq);
}
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+ const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+ lockdep_assert_held(&dd->lock);
+
+ return stats->inserted - atomic_read(&stats->completed);
+}
+
/*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@@ -356,11 +345,26 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
}
/*
+ * Returns true if and only if @rq started after @latest_start where
+ * @latest_start is in jiffies.
+ */
+static bool started_after(struct deadline_data *dd, struct request *rq,
+ unsigned long latest_start)
+{
+ unsigned long start_time = (unsigned long)rq->fifo_time;
+
+ start_time -= dd->fifo_expire[rq_data_dir(rq)];
+
+ return time_after(start_time, latest_start);
+}
+
+/*
* deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
+ * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
*/
static struct request *__dd_dispatch_request(struct deadline_data *dd,
- struct dd_per_prio *per_prio)
+ struct dd_per_prio *per_prio,
+ unsigned long latest_start)
{
struct request *rq, *next_rq;
enum dd_data_dir data_dir;
@@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist);
+ if (started_after(dd, rq, latest_start))
+ return NULL;
list_del_init(&rq->queuelist);
goto done;
}
@@ -449,6 +455,9 @@ dispatch_find_request:
dd->batching = 0;
dispatch_request:
+ if (started_after(dd, rq, latest_start))
+ return NULL;
+
/*
* rq is the selected appropriate request.
*/
@@ -457,7 +466,7 @@ dispatch_request:
done:
ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class];
- dd_count(dd, dispatched, prio);
+ dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
*/
@@ -467,6 +476,34 @@ done:
}
/*
+ * Check whether there are any requests with priority other than DD_RT_PRIO
+ * that were inserted more than prio_aging_expire jiffies ago.
+ */
+static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
+ unsigned long now)
+{
+ struct request *rq;
+ enum dd_prio prio;
+ int prio_cnt;
+
+ lockdep_assert_held(&dd->lock);
+
+ prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
+ !!dd_queued(dd, DD_IDLE_PRIO);
+ if (prio_cnt < 2)
+ return NULL;
+
+ for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+ now - dd->prio_aging_expire);
+ if (rq)
+ return rq;
+ }
+
+ return NULL;
+}
+
+/*
* Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
*
* One confusing aspect here is that we get called for a specific
@@ -477,15 +514,26 @@ done:
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ const unsigned long now = jiffies;
struct request *rq;
enum dd_prio prio;
spin_lock(&dd->lock);
+ rq = dd_dispatch_prio_aged_requests(dd, now);
+ if (rq)
+ goto unlock;
+
+ /*
+ * Next, dispatch requests in priority order. Ignore lower priority
+ * requests if any higher priority requests are pending.
+ */
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
- rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
- if (rq)
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
+ if (rq || dd_queued(dd, prio))
break;
}
+
+unlock:
spin_unlock(&dd->lock);
return rq;
@@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
- sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
}
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
+ const struct io_stats_per_prio *stats = &per_prio->stats;
+ uint32_t queued;
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
- }
- free_percpu(dd->stats);
+ spin_lock(&dd->lock);
+ queued = dd_queued(dd, prio);
+ spin_unlock(&dd->lock);
+
+ WARN_ONCE(queued != 0,
+ "statistics for priority %d: i %u m %u d %u c %u\n",
+ prio, stats->inserted, stats->merged,
+ stats->dispatched, atomic_read(&stats->completed));
+ }
kfree(dd);
}
@@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
eq->elevator_data = dd;
- dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
- GFP_KERNEL | __GFP_ZERO);
- if (!dd->stats)
- goto free_dd;
-
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
@@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
+ dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
q->elevator = eq;
return 0;
-free_dd:
- kfree(dd);
-
put_eq:
kobject_put(&eq->kobj);
return ret;
@@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq);
prio = ioprio_class_to_prio[ioprio_class];
- dd_count(dd, inserted, prio);
- rq->elv.priv[0] = (void *)(uintptr_t)1;
+ per_prio = &dd->per_prio[prio];
+ if (!rq->elv.priv[0]) {
+ per_prio->stats.inserted++;
+ rq->elv.priv[0] = (void *)(uintptr_t)1;
+ }
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
blk_mq_free_requests(&free);
@@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
trace_block_rq_insert(rq);
- per_prio = &dd->per_prio[prio];
if (at_head) {
list_add(&rq->queuelist, &per_prio->dispatch);
} else {
@@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
/*
* The block layer core may call dd_finish_request() without having
- * called dd_insert_requests(). Hence only update statistics for
- * requests for which dd_insert_requests() has been called. See also
- * blk_mq_request_bypass_insert().
+ * called dd_insert_requests(). Skip requests that bypassed I/O
+ * scheduling. See also blk_mq_request_bypass_insert().
*/
- if (rq->elv.priv[0])
- dd_count(dd, completed, prio);
+ if (!rq->elv.priv[0])
+ return;
+
+ atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) {
unsigned long flags;
@@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges),
DD_ATTR(async_depth),
DD_ATTR(fifo_batch),
+ DD_ATTR(prio_aging_expire),
__ATTR_NULL
};
@@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
return 0;
}
-/* Number of requests queued for a given priority level. */
-static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
-{
- return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
-}
-
static int dd_queued_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
+ u32 rt, be, idle;
+
+ spin_lock(&dd->lock);
+ rt = dd_queued(dd, DD_RT_PRIO);
+ be = dd_queued(dd, DD_BE_PRIO);
+ idle = dd_queued(dd, DD_IDLE_PRIO);
+ spin_unlock(&dd->lock);
+
+ seq_printf(m, "%u %u %u\n", rt, be, idle);
- seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
- dd_queued(dd, DD_BE_PRIO),
- dd_queued(dd, DD_IDLE_PRIO));
return 0;
}
/* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
- return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
- - dd_sum(dd, completed, prio);
+ const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+ lockdep_assert_held(&dd->lock);
+
+ return stats->dispatched + stats->merged -
+ atomic_read(&stats->completed);
}
static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
+ u32 rt, be, idle;
+
+ spin_lock(&dd->lock);
+ rt = dd_owned_by_driver(dd, DD_RT_PRIO);
+ be = dd_owned_by_driver(dd, DD_BE_PRIO);
+ idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
+ spin_unlock(&dd->lock);
+
+ seq_printf(m, "%u %u %u\n", rt, be, idle);
- seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
- dd_owned_by_driver(dd, DD_BE_PRIO),
- dd_owned_by_driver(dd, DD_IDLE_PRIO));
return 0;
}
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 278593b8e4e9..7aff4eb81c60 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -2,6 +2,8 @@
#
# Partition configuration
#
+menu "Partition Types"
+
config PARTITION_ADVANCED
bool "Advanced partition selection"
help
@@ -267,3 +269,5 @@ config CMDLINE_PARTITION
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
+
+endmenu
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 7bea19dd9458..ed5deef1d7e1 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -5,6 +5,7 @@
* Copyright (C) 2020 Christoph Hellwig
*/
#include <linux/fs.h>
+#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
@@ -203,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n",
- queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
+ queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect));
}
@@ -213,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n",
- queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
+ queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect));
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 00c203b2a921..25a52a2a09a8 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -5,7 +5,7 @@
*/
#include <linux/t10-pi.h>
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/module.h>
#include <net/checksum.h>
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8b1714021498..2909fd9e72fb 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -61,10 +61,10 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/init.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/blk-mq.h>
-#include <linux/elevator.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index a093644ac39f..58e921ab5729 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -68,6 +68,7 @@
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/blk-mq.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/completion.h>
#include <linux/wait.h>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 530b31240203..aa0472718dce 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -282,7 +282,7 @@ out:
return err;
}
-static blk_qc_t brd_submit_bio(struct bio *bio)
+static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
sector_t sector = bio->bi_iter.bi_sector;
@@ -299,16 +299,14 @@ static blk_qc_t brd_submit_bio(struct bio *bio)
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
bio_op(bio), sector);
- if (err)
- goto io_error;
+ if (err) {
+ bio_io_error(bio);
+ return;
+ }
sector += len >> SECTOR_SHIFT;
}
bio_endio(bio);
- return BLK_QC_T_NONE;
-io_error:
- bio_io_error(bio);
- return BLK_QC_T_NONE;
}
static int brd_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 5d9181382ce1..6674a0b88341 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1448,7 +1448,7 @@ extern void conn_free_crypto(struct drbd_connection *connection);
/* drbd_req */
extern void do_submit(struct work_struct *ws);
extern void __drbd_make_request(struct drbd_device *, struct bio *);
-extern blk_qc_t drbd_submit_bio(struct bio *bio);
+void drbd_submit_bio(struct bio *bio);
extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 5ca233644d70..3235532ae077 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1596,7 +1596,7 @@ void do_submit(struct work_struct *ws)
}
}
-blk_qc_t drbd_submit_bio(struct bio *bio)
+void drbd_submit_bio(struct bio *bio)
{
struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
@@ -1609,7 +1609,6 @@ blk_qc_t drbd_submit_bio(struct bio *bio)
inc_ap_bio(device);
__drbd_make_request(device, bio);
- return BLK_QC_T_NONE;
}
static bool net_timeout_reached(struct drbd_request *net_req,
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index fef79ea52e3e..6288ce888414 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -184,6 +184,7 @@ static int print_unex = 1;
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/major.h>
#include <linux/platform_device.h>
#include <linux/mod_devicetable.h>
#include <linux/mutex.h>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7bf4686af774..ab23bbc467e7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -273,19 +273,6 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
}
/**
- * loop_validate_block_size() - validates the passed in block size
- * @bsize: size to validate
- */
-static int
-loop_validate_block_size(unsigned short bsize)
-{
- if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
- return -EINVAL;
-
- return 0;
-}
-
-/**
* loop_set_size() - sets device size and notifies userspace
* @lo: struct loop_device to set the size for
* @size: new size of the loop device
@@ -1236,7 +1223,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
}
if (config->block_size) {
- error = loop_validate_block_size(config->block_size);
+ error = blk_validate_block_size(config->block_size);
if (error)
goto out_unlock;
}
@@ -1329,7 +1316,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
{
struct file *filp = NULL;
gfp_t gfp = lo->old_gfp_mask;
- struct block_device *bdev = lo->lo_device;
int err = 0;
bool partscan = false;
int lo_number;
@@ -1395,22 +1381,16 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
blk_queue_logical_block_size(lo->lo_queue, 512);
blk_queue_physical_block_size(lo->lo_queue, 512);
blk_queue_io_min(lo->lo_queue, 512);
- if (bdev) {
- invalidate_bdev(bdev);
- bdev->bd_inode->i_mapping->wb_err = 0;
- }
- set_capacity(lo->lo_disk, 0);
+ invalidate_disk(lo->lo_disk);
loop_sysfs_exit(lo);
- if (bdev) {
- /* let user-space know about this change */
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- }
+ /* let user-space know about this change */
+ kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
mapping_set_gfp_mask(filp->f_mapping, gfp);
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
- partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
+ partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
lo_number = lo->lo_number;
disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
out_unlock:
@@ -1759,7 +1739,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
if (lo->lo_state != Lo_bound)
return -ENXIO;
- err = loop_validate_block_size(arg);
+ err = blk_validate_block_size(arg);
if (err)
return err;
diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c
index 26798da661bd..b168ca25b6c9 100644
--- a/drivers/block/n64cart.c
+++ b/drivers/block/n64cart.c
@@ -84,7 +84,7 @@ static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos)
return true;
}
-static blk_qc_t n64cart_submit_bio(struct bio *bio)
+static void n64cart_submit_bio(struct bio *bio)
{
struct bio_vec bvec;
struct bvec_iter iter;
@@ -92,16 +92,14 @@ static blk_qc_t n64cart_submit_bio(struct bio *bio)
u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
bio_for_each_segment(bvec, bio, iter) {
- if (!n64cart_do_bvec(dev, &bvec, pos))
- goto io_error;
+ if (!n64cart_do_bvec(dev, &bvec, pos)) {
+ bio_io_error(bio);
+ return;
+ }
pos += bvec.bv_len;
}
bio_endio(bio);
- return BLK_QC_T_NONE;
-io_error:
- bio_io_error(bio);
- return BLK_QC_T_NONE;
}
static const struct block_device_operations n64cart_fops = {
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 1183f7872b71..504c20a2f33e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -310,20 +310,13 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
-static void nbd_size_clear(struct nbd_device *nbd)
-{
- if (nbd->config->bytesize) {
- set_capacity(nbd->disk, 0);
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
- }
-}
-
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
if (!blksize)
blksize = 1u << NBD_DEF_BLKSIZE_BITS;
- if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
+
+ if (blk_validate_block_size(blksize))
return -EINVAL;
nbd->config->bytesize = bytesize;
@@ -1237,7 +1230,9 @@ static void nbd_config_put(struct nbd_device *nbd)
&nbd->config_lock)) {
struct nbd_config *config = nbd->config;
nbd_dev_dbg_close(nbd);
- nbd_size_clear(nbd);
+ invalidate_disk(nbd->disk);
+ if (nbd->config->bytesize)
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
&config->runtime_flags))
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 187d779c8ca0..e5cbcf582233 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1422,7 +1422,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
return &nullb->queues[index];
}
-static blk_qc_t null_submit_bio(struct bio *bio)
+static void null_submit_bio(struct bio *bio)
{
sector_t sector = bio->bi_iter.bi_sector;
sector_t nr_sectors = bio_sectors(bio);
@@ -1434,7 +1434,6 @@ static blk_qc_t null_submit_bio(struct bio *bio)
cmd->bio = bio;
null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
- return BLK_QC_T_NONE;
}
static bool should_timeout_request(struct request *rq)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 0f26b2510a75..cb52cce6fb03 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2400,7 +2400,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
}
}
-static blk_qc_t pkt_submit_bio(struct bio *bio)
+static void pkt_submit_bio(struct bio *bio)
{
struct pktcdvd_device *pd;
char b[BDEVNAME_SIZE];
@@ -2423,7 +2423,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
*/
if (bio_data_dir(bio) == READ) {
pkt_make_request_read(pd, bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2455,10 +2455,9 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
} while (split != bio);
- return BLK_QC_T_NONE;
+ return;
end_io:
bio_io_error(bio);
- return BLK_QC_T_NONE;
}
static void pkt_init_queue(struct pktcdvd_device *pd)
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index c7b19e128b03..d1ebf193cb9a 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -578,7 +578,7 @@ out:
return next;
}
-static blk_qc_t ps3vram_submit_bio(struct bio *bio)
+static void ps3vram_submit_bio(struct bio *bio)
{
struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -594,13 +594,11 @@ static blk_qc_t ps3vram_submit_bio(struct bio *bio)
spin_unlock_irq(&priv->lock);
if (busy)
- return BLK_QC_T_NONE;
+ return;
do {
bio = ps3vram_do_bio(dev, bio);
} while (bio);
-
- return BLK_QC_T_NONE;
}
static const struct block_device_operations ps3vram_fops = {
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e65c9d706f6f..bf60aebd0cfb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -836,7 +836,7 @@ struct rbd_options {
u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};
-#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
+#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_DEFAULT_RQ
#define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
#define RBD_READ_ONLY_DEFAULT false
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index bd4a41afbbfc..0ec0191d4196 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
return ret;
}
-static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct rnbd_queue *q = hctx->driver_data;
struct rnbd_clt_dev *dev = q->dev;
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
index c1bc5c0fef71..de5d5a8df81d 100644
--- a/drivers/block/rnbd/rnbd-proto.h
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -10,7 +10,7 @@
#define RNBD_PROTO_H
#include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/limits.h>
#include <linux/inet.h>
#include <linux/in.h>
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 1cc40b0ea761..268252380e88 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -50,7 +50,7 @@ struct rsxx_bio_meta {
static struct kmem_cache *bio_meta_pool;
-static blk_qc_t rsxx_submit_bio(struct bio *bio);
+static void rsxx_submit_bio(struct bio *bio);
/*----------------- Block Device Operations -----------------*/
static int rsxx_blkdev_ioctl(struct block_device *bdev,
@@ -120,7 +120,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
}
}
-static blk_qc_t rsxx_submit_bio(struct bio *bio)
+static void rsxx_submit_bio(struct bio *bio)
{
struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data;
struct rsxx_bio_meta *bio_meta;
@@ -169,7 +169,7 @@ static blk_qc_t rsxx_submit_bio(struct bio *bio)
if (st)
goto queue_err;
- return BLK_QC_T_NONE;
+ return;
queue_err:
kmem_cache_free(bio_meta_pool, bio_meta);
@@ -177,7 +177,6 @@ req_err:
if (st)
bio->bi_status = st;
bio_endio(bio);
- return BLK_QC_T_NONE;
}
/*----------------- Device Setup -------------------*/
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 7ccc8d2a41bc..3911d0833e1b 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -16,6 +16,7 @@
#include <linux/fd.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 303caf2d17d0..fd086179f980 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -815,9 +815,17 @@ static int virtblk_probe(struct virtio_device *vdev)
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
- if (!err)
+ if (!err) {
+ err = blk_validate_block_size(blk_size);
+ if (err) {
+ dev_err(&vdev->dev,
+ "virtio_blk: invalid block size: 0x%x\n",
+ blk_size);
+ goto out_cleanup_disk;
+ }
+
blk_queue_logical_block_size(q, blk_size);
- else
+ } else
blk_size = queue_logical_block_size(q);
/* Use topology information if available */
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 72902104f111..df0deb927760 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -42,6 +42,7 @@
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/bitmap.h>
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index fcaf2750f68f..a68297fb51a2 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1598,22 +1598,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
/*
* Handler function for all zram I/O requests.
*/
-static blk_qc_t zram_submit_bio(struct bio *bio)
+static void zram_submit_bio(struct bio *bio)
{
struct zram *zram = bio->bi_bdev->bd_disk->private_data;
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto error;
+ bio_io_error(bio);
+ return;
}
__zram_make_request(zram, bio);
- return BLK_QC_T_NONE;
-
-error:
- bio_io_error(bio);
- return BLK_QC_T_NONE;
}
static void zram_slot_free_notify(struct block_device *bdev,
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 5259edacde38..066a9118c374 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/sched/clock.h>
struct drm_i915_private;
struct timer_list;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6d1de889baeb..23b28edae90f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1163,7 +1163,7 @@ static void quit_max_writeback_rate(struct cache_set *c,
/* Cached devices - read & write stuff */
-blk_qc_t cached_dev_submit_bio(struct bio *bio)
+void cached_dev_submit_bio(struct bio *bio)
{
struct search *s;
struct block_device *orig_bdev = bio->bi_bdev;
@@ -1176,7 +1176,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
dc->io_disable)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- return BLK_QC_T_NONE;
+ return;
}
if (likely(d->c)) {
@@ -1222,8 +1222,6 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
} else
/* I/O request sent to backing device */
detached_dev_do_request(d, bio, orig_bdev, start_time);
-
- return BLK_QC_T_NONE;
}
static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
@@ -1273,7 +1271,7 @@ static void flash_dev_nodata(struct closure *cl)
continue_at(cl, search_free, NULL);
}
-blk_qc_t flash_dev_submit_bio(struct bio *bio)
+void flash_dev_submit_bio(struct bio *bio)
{
struct search *s;
struct closure *cl;
@@ -1282,7 +1280,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- return BLK_QC_T_NONE;
+ return;
}
s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
@@ -1298,7 +1296,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
bcache_wq);
- return BLK_QC_T_NONE;
+ return;
} else if (bio_data_dir(bio)) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
@@ -1314,7 +1312,6 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
}
continue_at(cl, search_free, NULL);
- return BLK_QC_T_NONE;
}
static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 82b38366a95d..38ab4856eaab 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
-blk_qc_t cached_dev_submit_bio(struct bio *bio);
+void cached_dev_submit_bio(struct bio *bio);
void bch_flash_dev_request_init(struct bcache_device *d);
-blk_qc_t flash_dev_submit_bio(struct bio *bio);
+void flash_dev_submit_bio(struct bio *bio);
extern struct kmem_cache *bch_search_cache;
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index a3b71350eec8..745e3ab4aa0a 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -8,6 +8,7 @@
#define DM_BIO_RECORD_H
#include <linux/bio.h>
+#include <linux/blk-integrity.h>
/*
* There are lots of mutable fields in the bio struct that get
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 55dccdfbcb22..b855fef4f38a 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -13,7 +13,7 @@
#include <linux/ktime.h>
#include <linux/genhd.h>
#include <linux/blk-mq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#include <trace/events/block.h>
@@ -200,7 +200,7 @@ struct dm_table {
struct dm_md_mempools *mempools;
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- struct blk_keyslot_manager *ksm;
+ struct blk_crypto_profile *crypto_profile;
#endif
};
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 916b7da16de2..292f7896f733 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -15,6 +15,7 @@
#include <linux/key.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/crypto.h>
diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c
index 2c5edfbd7711..957999998d70 100644
--- a/drivers/md/dm-ima.c
+++ b/drivers/md/dm-ima.c
@@ -12,6 +12,7 @@
#include "dm-ima.h"
#include <linux/ima.h>
+#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include <linux/crypto.h>
#include <crypto/hash_info.h>
diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c
index 1856a1b125cc..875bca30a0dd 100644
--- a/drivers/md/dm-ps-historical-service-time.c
+++ b/drivers/md/dm-ps-historical-service-time.c
@@ -27,6 +27,7 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/sched/clock.h>
#define DM_MSG_PREFIX "multipath historical-service-time"
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a896dea9750e..579ab6183d4d 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -7,7 +7,6 @@
#include "dm-core.h"
#include "dm-rq.h"
-#include <linux/elevator.h> /* for rq_end_sector() */
#include <linux/blk-mq.h>
#define DM_MSG_PREFIX "core-rq"
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2111daaacaba..8b0f27a745d9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/string.h>
@@ -169,7 +170,7 @@ static void free_devices(struct list_head *devices, struct mapped_device *md)
}
}
-static void dm_table_destroy_keyslot_manager(struct dm_table *t);
+static void dm_table_destroy_crypto_profile(struct dm_table *t);
void dm_table_destroy(struct dm_table *t)
{
@@ -199,7 +200,7 @@ void dm_table_destroy(struct dm_table *t)
dm_free_md_mempools(t->mempools);
- dm_table_destroy_keyslot_manager(t);
+ dm_table_destroy_crypto_profile(t);
kfree(t);
}
@@ -1186,8 +1187,8 @@ static int dm_table_register_integrity(struct dm_table *t)
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-struct dm_keyslot_manager {
- struct blk_keyslot_manager ksm;
+struct dm_crypto_profile {
+ struct blk_crypto_profile profile;
struct mapped_device *md;
};
@@ -1213,13 +1214,11 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
* When an inline encryption key is evicted from a device-mapper device, evict
* it from all the underlying devices.
*/
-static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int dm_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key, unsigned int slot)
{
- struct dm_keyslot_manager *dksm = container_of(ksm,
- struct dm_keyslot_manager,
- ksm);
- struct mapped_device *md = dksm->md;
+ struct mapped_device *md =
+ container_of(profile, struct dm_crypto_profile, profile)->md;
struct dm_keyslot_evict_args args = { key };
struct dm_table *t;
int srcu_idx;
@@ -1239,150 +1238,148 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
return args.err;
}
-static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
- .keyslot_evict = dm_keyslot_evict,
-};
-
-static int device_intersect_crypto_modes(struct dm_target *ti,
- struct dm_dev *dev, sector_t start,
- sector_t len, void *data)
+static int
+device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- struct blk_keyslot_manager *parent = data;
- struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm;
+ struct blk_crypto_profile *parent = data;
+ struct blk_crypto_profile *child =
+ bdev_get_queue(dev->bdev)->crypto_profile;
- blk_ksm_intersect_modes(parent, child);
+ blk_crypto_intersect_capabilities(parent, child);
return 0;
}
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
{
- struct dm_keyslot_manager *dksm = container_of(ksm,
- struct dm_keyslot_manager,
- ksm);
+ struct dm_crypto_profile *dmcp = container_of(profile,
+ struct dm_crypto_profile,
+ profile);
- if (!ksm)
+ if (!profile)
return;
- blk_ksm_destroy(ksm);
- kfree(dksm);
+ blk_crypto_profile_destroy(profile);
+ kfree(dmcp);
}
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
{
- dm_destroy_keyslot_manager(t->ksm);
- t->ksm = NULL;
+ dm_destroy_crypto_profile(t->crypto_profile);
+ t->crypto_profile = NULL;
}
/*
- * Constructs and initializes t->ksm with a keyslot manager that
- * represents the common set of crypto capabilities of the devices
- * described by the dm_table. However, if the constructed keyslot
- * manager does not support a superset of the crypto capabilities
- * supported by the current keyslot manager of the mapped_device,
- * it returns an error instead, since we don't support restricting
- * crypto capabilities on table changes. Finally, if the constructed
- * keyslot manager doesn't actually support any crypto modes at all,
- * it just returns NULL.
+ * Constructs and initializes t->crypto_profile with a crypto profile that
+ * represents the common set of crypto capabilities of the devices described by
+ * the dm_table. However, if the constructed crypto profile doesn't support all
+ * crypto capabilities that are supported by the current mapped_device, it
+ * returns an error instead, since we don't support removing crypto capabilities
+ * on table changes. Finally, if the constructed crypto profile is "empty" (has
+ * no crypto capabilities at all), it just sets t->crypto_profile to NULL.
*/
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
{
- struct dm_keyslot_manager *dksm;
- struct blk_keyslot_manager *ksm;
+ struct dm_crypto_profile *dmcp;
+ struct blk_crypto_profile *profile;
struct dm_target *ti;
unsigned int i;
- bool ksm_is_empty = true;
+ bool empty_profile = true;
- dksm = kmalloc(sizeof(*dksm), GFP_KERNEL);
- if (!dksm)
+ dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL);
+ if (!dmcp)
return -ENOMEM;
- dksm->md = t->md;
+ dmcp->md = t->md;
- ksm = &dksm->ksm;
- blk_ksm_init_passthrough(ksm);
- ksm->ksm_ll_ops = dm_ksm_ll_ops;
- ksm->max_dun_bytes_supported = UINT_MAX;
- memset(ksm->crypto_modes_supported, 0xFF,
- sizeof(ksm->crypto_modes_supported));
+ profile = &dmcp->profile;
+ blk_crypto_profile_init(profile, 0);
+ profile->ll_ops.keyslot_evict = dm_keyslot_evict;
+ profile->max_dun_bytes_supported = UINT_MAX;
+ memset(profile->modes_supported, 0xFF,
+ sizeof(profile->modes_supported));
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!dm_target_passes_crypto(ti->type)) {
- blk_ksm_intersect_modes(ksm, NULL);
+ blk_crypto_intersect_capabilities(profile, NULL);
break;
}
if (!ti->type->iterate_devices)
continue;
- ti->type->iterate_devices(ti, device_intersect_crypto_modes,
- ksm);
+ ti->type->iterate_devices(ti,
+ device_intersect_crypto_capabilities,
+ profile);
}
- if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) {
+ if (t->md->queue &&
+ !blk_crypto_has_capabilities(profile,
+ t->md->queue->crypto_profile)) {
DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!");
- dm_destroy_keyslot_manager(ksm);
+ dm_destroy_crypto_profile(profile);
return -EINVAL;
}
/*
- * If the new KSM doesn't actually support any crypto modes, we may as
- * well represent it with a NULL ksm.
+ * If the new profile doesn't actually support any crypto capabilities,
+ * we may as well represent it with a NULL profile.
*/
- ksm_is_empty = true;
- for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) {
- if (ksm->crypto_modes_supported[i]) {
- ksm_is_empty = false;
+ for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) {
+ if (profile->modes_supported[i]) {
+ empty_profile = false;
break;
}
}
- if (ksm_is_empty) {
- dm_destroy_keyslot_manager(ksm);
- ksm = NULL;
+ if (empty_profile) {
+ dm_destroy_crypto_profile(profile);
+ profile = NULL;
}
/*
- * t->ksm is only set temporarily while the table is being set
- * up, and it gets set to NULL after the capabilities have
- * been transferred to the request_queue.
+ * t->crypto_profile is only set temporarily while the table is being
+ * set up, and it gets set to NULL after the profile has been
+ * transferred to the request_queue.
*/
- t->ksm = ksm;
+ t->crypto_profile = profile;
return 0;
}
-static void dm_update_keyslot_manager(struct request_queue *q,
- struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+ struct dm_table *t)
{
- if (!t->ksm)
+ if (!t->crypto_profile)
return;
- /* Make the ksm less restrictive */
- if (!q->ksm) {
- blk_ksm_register(t->ksm, q);
+ /* Make the crypto profile less restrictive. */
+ if (!q->crypto_profile) {
+ blk_crypto_register(t->crypto_profile, q);
} else {
- blk_ksm_update_capabilities(q->ksm, t->ksm);
- dm_destroy_keyslot_manager(t->ksm);
+ blk_crypto_update_capabilities(q->crypto_profile,
+ t->crypto_profile);
+ dm_destroy_crypto_profile(t->crypto_profile);
}
- t->ksm = NULL;
+ t->crypto_profile = NULL;
}
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static int dm_table_construct_keyslot_manager(struct dm_table *t)
+static int dm_table_construct_crypto_profile(struct dm_table *t)
{
return 0;
}
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile)
{
}
-static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+static void dm_table_destroy_crypto_profile(struct dm_table *t)
{
}
-static void dm_update_keyslot_manager(struct request_queue *q,
- struct dm_table *t)
+static void dm_update_crypto_profile(struct request_queue *q,
+ struct dm_table *t)
{
}
@@ -1414,9 +1411,9 @@ int dm_table_complete(struct dm_table *t)
return r;
}
- r = dm_table_construct_keyslot_manager(t);
+ r = dm_table_construct_crypto_profile(t);
if (r) {
- DMERR("could not construct keyslot manager.");
+ DMERR("could not construct crypto profile.");
return r;
}
@@ -2070,7 +2067,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
return r;
}
- dm_update_keyslot_manager(q, t);
+ dm_update_crypto_profile(q, t);
disk_update_readahead(t->md->disk);
return 0;
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 88288c8d6bc8..aae48a8b1a04 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -18,6 +18,7 @@
#include "dm-verity-verify-sig.h"
#include <linux/module.h>
#include <linux/reboot.h>
+#include <linux/scatterlist.h>
#define DM_MSG_PREFIX "verity"
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 76d9da49fda7..8b91f4f0e053 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -29,7 +29,7 @@
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#define DM_MSG_PREFIX "core"
@@ -1183,14 +1183,13 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
mutex_unlock(&md->swap_bios_lock);
}
-static blk_qc_t __map_bio(struct dm_target_io *tio)
+static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
struct bio *clone = &tio->clone;
struct dm_io *io = tio->io;
struct dm_target *ti = tio->ti;
- blk_qc_t ret = BLK_QC_T_NONE;
clone->bi_end_io = clone_endio;
@@ -1226,7 +1225,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
case DM_MAPIO_REMAPPED:
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
- ret = submit_bio_noacct(clone);
+ submit_bio_noacct(clone);
break;
case DM_MAPIO_KILL:
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1248,8 +1247,6 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
-
- return ret;
}
static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
@@ -1336,7 +1333,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
}
}
-static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
+static void __clone_and_map_simple_bio(struct clone_info *ci,
struct dm_target_io *tio, unsigned *len)
{
struct bio *clone = &tio->clone;
@@ -1346,8 +1343,7 @@ static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
__bio_clone_fast(clone, ci->bio);
if (len)
bio_setup_sector(clone, ci->sector, *len);
-
- return __map_bio(tio);
+ __map_bio(tio);
}
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
@@ -1361,7 +1357,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
while ((bio = bio_list_pop(&blist))) {
tio = container_of(bio, struct dm_target_io, clone);
- (void) __clone_and_map_simple_bio(ci, tio, len);
+ __clone_and_map_simple_bio(ci, tio, len);
}
}
@@ -1405,7 +1401,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
free_tio(tio);
return r;
}
- (void) __map_bio(tio);
+ __map_bio(tio);
return 0;
}
@@ -1520,11 +1516,10 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
-static blk_qc_t __split_and_process_bio(struct mapped_device *md,
+static void __split_and_process_bio(struct mapped_device *md,
struct dm_table *map, struct bio *bio)
{
struct clone_info ci;
- blk_qc_t ret = BLK_QC_T_NONE;
int error = 0;
init_clone_info(&ci, md, map, bio);
@@ -1567,19 +1562,17 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
bio_chain(b, bio);
trace_block_split(b, bio->bi_iter.bi_sector);
- ret = submit_bio_noacct(bio);
+ submit_bio_noacct(bio);
}
}
/* drop the extra reference count */
dm_io_dec_pending(ci.io, errno_to_blk_status(error));
- return ret;
}
-static blk_qc_t dm_submit_bio(struct bio *bio)
+static void dm_submit_bio(struct bio *bio)
{
struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
- blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
struct dm_table *map;
@@ -1609,10 +1602,9 @@ static blk_qc_t dm_submit_bio(struct bio *bio)
if (is_abnormal_io(bio))
blk_queue_split(&bio);
- ret = __split_and_process_bio(md, map, bio);
+ __split_and_process_bio(md, map, bio);
out:
dm_put_live_table(md, srcu_idx);
- return ret;
}
/*-----------------------------------------------------------------
@@ -1671,14 +1663,14 @@ static const struct dax_operations dm_dax_ops;
static void dm_wq_work(struct work_struct *work);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
- dm_destroy_keyslot_manager(q->ksm);
+ dm_destroy_crypto_profile(q->crypto_profile);
}
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
@@ -1704,7 +1696,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
dm_sysfs_exit(md);
del_gendisk(md->disk);
}
- dm_queue_destroy_keyslot_manager(md->queue);
+ dm_queue_destroy_crypto_profile(md->queue);
blk_cleanup_disk(md->disk);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6c0c3d0d905a..22310d5d8d41 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -41,6 +41,7 @@
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
@@ -51,6 +52,7 @@
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
+#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
@@ -441,19 +443,19 @@ check_suspended:
}
EXPORT_SYMBOL(md_handle_request);
-static blk_qc_t md_submit_bio(struct bio *bio)
+static void md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
- return BLK_QC_T_NONE;
+ return;
}
if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
bio_io_error(bio);
- return BLK_QC_T_NONE;
+ return;
}
blk_queue_split(&bio);
@@ -462,15 +464,13 @@ static blk_qc_t md_submit_bio(struct bio *bio)
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- return BLK_QC_T_NONE;
+ return;
}
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
md_handle_request(mddev, bio);
-
- return BLK_QC_T_NONE;
}
/* mddev_suspend makes sure no new requests are submitted
diff --git a/drivers/mmc/core/crypto.c b/drivers/mmc/core/crypto.c
index 67557808cada..fec4fbf16a5b 100644
--- a/drivers/mmc/core/crypto.c
+++ b/drivers/mmc/core/crypto.c
@@ -16,13 +16,13 @@ void mmc_crypto_set_initial_state(struct mmc_host *host)
{
/* Reset might clear all keys, so reprogram all the keys. */
if (host->caps2 & MMC_CAP2_CRYPTO)
- blk_ksm_reprogram_all_keys(&host->ksm);
+ blk_crypto_reprogram_all_keys(&host->crypto_profile);
}
void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host)
{
if (host->caps2 & MMC_CAP2_CRYPTO)
- blk_ksm_register(&host->ksm, q);
+ blk_crypto_register(&host->crypto_profile, q);
}
EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue);
@@ -30,12 +30,15 @@ void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq)
{
struct request *req = mmc_queue_req_to_req(mqrq);
struct mmc_request *mrq = &mqrq->brq.mrq;
+ struct blk_crypto_keyslot *keyslot;
if (!req->crypt_ctx)
return;
mrq->crypto_ctx = req->crypt_ctx;
- if (req->crypt_keyslot)
- mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot);
+
+ keyslot = req->crypt_keyslot;
+ if (keyslot)
+ mrq->crypto_key_slot = blk_crypto_keyslot_index(keyslot);
}
EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req);
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 4646b7a03db6..c9db24e16af1 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/pm_runtime.h>
+#include <linux/scatterlist.h>
#include <linux/mmc/host.h>
#include <linux/mmc/card.h>
diff --git a/drivers/mmc/host/cqhci-crypto.c b/drivers/mmc/host/cqhci-crypto.c
index 6419cfbb4ab7..d5f4b6972f63 100644
--- a/drivers/mmc/host/cqhci-crypto.c
+++ b/drivers/mmc/host/cqhci-crypto.c
@@ -6,7 +6,7 @@
*/
#include <linux/blk-crypto.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/mmc/host.h>
#include "cqhci-crypto.h"
@@ -23,9 +23,10 @@ static const struct cqhci_crypto_alg_entry {
};
static inline struct cqhci_host *
-cqhci_host_from_ksm(struct blk_keyslot_manager *ksm)
+cqhci_host_from_crypto_profile(struct blk_crypto_profile *profile)
{
- struct mmc_host *mmc = container_of(ksm, struct mmc_host, ksm);
+ struct mmc_host *mmc =
+ container_of(profile, struct mmc_host, crypto_profile);
return mmc->cqe_private;
}
@@ -57,12 +58,12 @@ static int cqhci_crypto_program_key(struct cqhci_host *cq_host,
return 0;
}
-static int cqhci_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
- struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+ struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
const union cqhci_crypto_cap_entry *ccap_array =
cq_host->crypto_cap_array;
const struct cqhci_crypto_alg_entry *alg =
@@ -115,11 +116,11 @@ static int cqhci_crypto_clear_keyslot(struct cqhci_host *cq_host, int slot)
return cqhci_crypto_program_key(cq_host, &cfg, slot);
}
-static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int cqhci_crypto_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
- struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm);
+ struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile);
return cqhci_crypto_clear_keyslot(cq_host, slot);
}
@@ -132,7 +133,7 @@ static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
* "enabled" when these are called, i.e. CQHCI_ENABLE might not be set in the
* CQHCI_CFG register. But the hardware allows that.
*/
-static const struct blk_ksm_ll_ops cqhci_ksm_ops = {
+static const struct blk_crypto_ll_ops cqhci_crypto_ops = {
.keyslot_program = cqhci_crypto_keyslot_program,
.keyslot_evict = cqhci_crypto_keyslot_evict,
};
@@ -157,8 +158,8 @@ cqhci_find_blk_crypto_mode(union cqhci_crypto_cap_entry cap)
*
* If the driver previously set MMC_CAP2_CRYPTO and the CQE declares
* CQHCI_CAP_CS, initialize the crypto support. This involves reading the
- * crypto capability registers, initializing the keyslot manager, clearing all
- * keyslots, and enabling 128-bit task descriptors.
+ * crypto capability registers, initializing the blk_crypto_profile, clearing
+ * all keyslots, and enabling 128-bit task descriptors.
*
* Return: 0 if crypto was initialized or isn't supported; whether
* MMC_CAP2_CRYPTO remains set indicates which one of those cases it is.
@@ -168,7 +169,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
{
struct mmc_host *mmc = cq_host->mmc;
struct device *dev = mmc_dev(mmc);
- struct blk_keyslot_manager *ksm = &mmc->ksm;
+ struct blk_crypto_profile *profile = &mmc->crypto_profile;
unsigned int num_keyslots;
unsigned int cap_idx;
enum blk_crypto_mode_num blk_mode_num;
@@ -199,15 +200,15 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
*/
num_keyslots = cq_host->crypto_capabilities.config_count + 1;
- err = devm_blk_ksm_init(dev, ksm, num_keyslots);
+ err = devm_blk_crypto_profile_init(dev, profile, num_keyslots);
if (err)
goto out;
- ksm->ksm_ll_ops = cqhci_ksm_ops;
- ksm->dev = dev;
+ profile->ll_ops = cqhci_crypto_ops;
+ profile->dev = dev;
/* Unfortunately, CQHCI crypto only supports 32 DUN bits. */
- ksm->max_dun_bytes_supported = 4;
+ profile->max_dun_bytes_supported = 4;
/*
* Cache all the crypto capabilities and advertise the supported crypto
@@ -223,7 +224,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host)
cq_host->crypto_cap_array[cap_idx]);
if (blk_mode_num == BLK_ENCRYPTION_MODE_INVALID)
continue;
- ksm->crypto_modes_supported[blk_mode_num] |=
+ profile->modes_supported[blk_mode_num] |=
cq_host->crypto_cap_array[cap_idx].sdus_mask * 512;
}
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 38b6aa849c63..5ff001140ef4 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/major.h>
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/fs_context.h>
#include "mtdcore.h"
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 088d3dd6f6fa..b6c6866f9259 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -162,7 +162,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
return err;
}
-static blk_qc_t nd_blk_submit_bio(struct bio *bio)
+static void nd_blk_submit_bio(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
@@ -173,7 +173,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
bool do_acct;
if (!bio_integrity_prep(bio))
- return BLK_QC_T_NONE;
+ return;
bip = bio_integrity(bio);
rw = bio_data_dir(bio);
@@ -199,7 +199,6 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
bio_end_io_acct(bio, start);
bio_endio(bio);
- return BLK_QC_T_NONE;
}
static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 92dec4952297..4295fa809420 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1440,7 +1440,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
return ret;
}
-static blk_qc_t btt_submit_bio(struct bio *bio)
+static void btt_submit_bio(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct btt *btt = bio->bi_bdev->bd_disk->private_data;
@@ -1451,7 +1451,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
bool do_acct;
if (!bio_integrity_prep(bio))
- return BLK_QC_T_NONE;
+ return;
do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
@@ -1483,7 +1483,6 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
bio_end_io_acct(bio, start);
bio_endio(bio);
- return BLK_QC_T_NONE;
}
static int btt_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 7de592d7eff4..6a45fa91e8a3 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/module.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/device.h>
#include <linux/ctype.h>
#include <linux/ndctl.h>
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 054154c22899..c74d7bceb222 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -190,7 +190,7 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
return rc;
}
-static blk_qc_t pmem_submit_bio(struct bio *bio)
+static void pmem_submit_bio(struct bio *bio)
{
int ret = 0;
blk_status_t rc = 0;
@@ -229,7 +229,6 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
- return BLK_QC_T_NONE;
}
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f8dd664b2eda..eb284f45fc44 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -118,25 +119,6 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
-/*
- * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_butex. This will end buffered writers dirtying pages that can't
- * be synced.
- */
-static void nvme_set_queue_dying(struct nvme_ns *ns)
-{
- if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
- return;
-
- blk_set_queue_dying(ns->queue);
- blk_mq_unquiesce_queue(ns->queue);
-
- set_capacity_and_notify(ns->disk, 0);
-}
-
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
/*
@@ -345,15 +327,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
return RETRY;
}
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
{
- blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
+ nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
blk_mq_end_request(req, status);
}
@@ -380,6 +366,13 @@ void nvme_complete_rq(struct request *req)
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
+void nvme_complete_batch_req(struct request *req)
+{
+ nvme_cleanup_cmd(req);
+ nvme_end_req_zoned(req);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
/*
* Called to unwind from ->queue_rq on a failed command submission so that the
* multipathing code gets called to potentially failover to another path.
@@ -631,7 +624,7 @@ static inline void nvme_init_request(struct request *req,
req->cmd_flags |= REQ_FAILFAST_DRIVER;
if (req->mq_hctx->type == HCTX_TYPE_POLL)
- req->cmd_flags |= REQ_HIPRI;
+ req->cmd_flags |= REQ_POLLED;
nvme_clear_nvme_request(req);
memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
}
@@ -4473,6 +4466,37 @@ out:
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+static void nvme_start_ns_queue(struct nvme_ns *ns)
+{
+ if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
+ blk_mq_unquiesce_queue(ns->queue);
+}
+
+static void nvme_stop_ns_queue(struct nvme_ns *ns)
+{
+ if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
+ blk_mq_quiesce_queue(ns->queue);
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_butex. This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ return;
+
+ blk_set_queue_dying(ns->queue);
+ nvme_start_ns_queue(ns);
+
+ set_capacity_and_notify(ns->disk, 0);
+}
+
/**
* nvme_kill_queues(): Ends all namespace queues
* @ctrl: the dead controller that needs to end
@@ -4488,7 +4512,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
/* Forcibly unquiesce queues to avoid blocking dispatch */
if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_start_admin_queue(ctrl);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_set_queue_dying(ns);
@@ -4551,7 +4575,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_quiesce_queue(ns->queue);
+ nvme_stop_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -4562,11 +4586,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_unquiesce_queue(ns->queue);
+ nvme_start_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
+{
+ if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+ blk_mq_quiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
+
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
+{
+ if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
+
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aa14ad963d91..580a216da619 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2382,7 +2382,7 @@ nvme_fc_ctrl_free(struct kref *ref)
list_del(&ctrl->ctrl_list);
spin_unlock_irqrestore(&ctrl->rport->lock, flags);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
@@ -2510,7 +2510,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
/*
* clean up the admin queue. Same thing as above.
*/
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
@@ -3095,7 +3095,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
ret = nvme_init_ctrl_finish(&ctrl->ctrl);
if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
@@ -3249,7 +3249,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_free_queue(&ctrl->queues[0]);
/* re-enable the admin_q so anything new can fast fail */
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
/* resume the io queues so that things will fast fail */
nvme_start_queues(&ctrl->ctrl);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fba06618c6c2..11440c86881e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -85,8 +85,13 @@ void nvme_failover_req(struct request *req)
}
spin_lock_irqsave(&ns->head->requeue_lock, flags);
- for (bio = req->bio; bio; bio = bio->bi_next)
+ for (bio = req->bio; bio; bio = bio->bi_next) {
bio_set_dev(bio, ns->head->disk->part0);
+ if (bio->bi_opf & REQ_POLLED) {
+ bio->bi_opf &= ~REQ_POLLED;
+ bio->bi_cookie = BLK_QC_T_NONE;
+ }
+ }
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
@@ -312,12 +317,11 @@ static bool nvme_available_path(struct nvme_ns_head *head)
return false;
}
-static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
+static void nvme_ns_head_submit_bio(struct bio *bio)
{
struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
struct device *dev = disk_to_dev(head->disk);
struct nvme_ns *ns;
- blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
/*
@@ -334,7 +338,7 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
bio->bi_opf |= REQ_NVME_MPATH;
trace_block_bio_remap(bio, disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
- ret = submit_bio_noacct(bio);
+ submit_bio_noacct(bio);
} else if (nvme_available_path(head)) {
dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
@@ -349,7 +353,6 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
}
srcu_read_unlock(&head->srcu, srcu_idx);
- return ret;
}
static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
@@ -479,6 +482,15 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
+ /*
+ * This assumes all controllers that refer to a namespace either
+ * support poll queues or not. That is not a strict guarantee,
+ * but if the assumption is wrong the effect is only suboptimal
+ * performance but not correctness problem.
+ */
+ if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
+ ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
+ blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
/* set to a default value of 512 until the disk is validated */
blk_queue_logical_block_size(head->disk->queue, 512);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..3652439a7458 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -342,6 +342,7 @@ struct nvme_ctrl {
int nr_reconnects;
unsigned long flags;
#define NVME_CTRL_FAILFAST_EXPIRED 0
+#define NVME_CTRL_ADMIN_Q_STOPPED 1
struct nvmf_ctrl_options *opts;
struct page *discard_page;
@@ -463,6 +464,7 @@ struct nvme_ns {
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
+#define NVME_NS_STOPPED 5
struct cdev cdev;
struct device cdev_device;
@@ -638,6 +640,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
}
void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
+ void (*fn)(struct request *rq))
+{
+ struct request *req;
+
+ rq_list_for_each(&iob->req_list, req) {
+ fn(req);
+ nvme_complete_batch_req(req);
+ }
+ blk_mq_end_request_batch(iob);
+}
+
blk_status_t nvme_host_path_error(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
@@ -665,6 +681,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
+void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 149ecf73df38..fde36c137900 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -10,6 +10,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
+#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
@@ -958,7 +959,7 @@ out_free_cmd:
return ret;
}
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_dev *dev = iod->nvmeq->dev;
@@ -968,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
rq_integrity_vec(req)->bv_len, rq_data_dir(req));
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+ nvme_pci_unmap_rq(req);
nvme_complete_rq(req);
}
+static void nvme_pci_complete_batch(struct io_comp_batch *iob)
+{
+ nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
@@ -995,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+ struct io_comp_batch *iob, u16 idx)
{
struct nvme_completion *cqe = &nvmeq->cqes[idx];
__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1022,7 +1034,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
}
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
- if (!nvme_try_complete_req(req, cqe->status, cqe->result))
+ if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
+ !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
+ nvme_pci_complete_batch))
nvme_pci_complete_rq(req);
}
@@ -1038,7 +1052,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
}
}
-static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
+ struct io_comp_batch *iob)
{
int found = 0;
@@ -1049,7 +1064,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
* the cqe requires a full read memory barrier
*/
dma_rmb();
- nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+ nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
nvme_update_cq_head(nvmeq);
}
@@ -1061,9 +1076,13 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
static irqreturn_t nvme_irq(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
+ DEFINE_IO_COMP_BATCH(iob);
- if (nvme_process_cq(nvmeq))
+ if (nvme_poll_cq(nvmeq, &iob)) {
+ if (!rq_list_empty(iob.req_list))
+ nvme_pci_complete_batch(&iob);
return IRQ_HANDLED;
+ }
return IRQ_NONE;
}
@@ -1087,11 +1106,11 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
- nvme_process_cq(nvmeq);
+ nvme_poll_cq(nvmeq, NULL);
enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}
-static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_queue *nvmeq = hctx->driver_data;
bool found;
@@ -1100,7 +1119,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
return 0;
spin_lock(&nvmeq->cq_poll_lock);
- found = nvme_process_cq(nvmeq);
+ found = nvme_poll_cq(nvmeq, iob);
spin_unlock(&nvmeq->cq_poll_lock);
return found;
@@ -1273,7 +1292,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* Did we miss an interrupt?
*/
if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
- nvme_poll(req->mq_hctx);
+ nvme_poll(req->mq_hctx, NULL);
else
nvme_poll_irqdisable(nvmeq);
@@ -1395,7 +1414,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
- blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
+ nvme_stop_admin_queue(&nvmeq->dev->ctrl);
if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
return 0;
@@ -1433,7 +1452,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev)
for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
spin_lock(&dev->queues[i].cq_poll_lock);
- nvme_process_cq(&dev->queues[i]);
+ nvme_poll_cq(&dev->queues[i], NULL);
spin_unlock(&dev->queues[i].cq_poll_lock);
}
}
@@ -1654,7 +1673,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
* user requests may be waiting on a stopped queue. Start the
* queue to flush these to completion.
*/
- blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+ nvme_start_admin_queue(&dev->ctrl);
blk_cleanup_queue(dev->ctrl.admin_q);
blk_mq_free_tag_set(&dev->admin_tagset);
}
@@ -1688,7 +1707,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
return -ENODEV;
}
} else
- blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+ nvme_start_admin_queue(&dev->ctrl);
return 0;
}
@@ -2623,7 +2642,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
if (shutdown) {
nvme_start_queues(&dev->ctrl);
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
- blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+ nvme_start_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 042c594bc57e..7d9ac544db37 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -13,6 +13,7 @@
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-rdma.h>
+#include <linux/blk-integrity.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -918,7 +919,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
else
ctrl->ctrl.max_integrity_segments = 0;
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
error = nvme_init_ctrl_finish(&ctrl->ctrl);
if (error)
@@ -927,7 +928,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return 0;
out_quiesce_queue:
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
out_stop_queue:
nvme_rdma_stop_queue(&ctrl->queues[0]);
@@ -1025,12 +1026,12 @@ out_free_io_queues:
static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
if (remove)
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -1153,7 +1154,7 @@ destroy_io:
nvme_rdma_destroy_io_queues(ctrl, new);
}
destroy_admin:
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
@@ -1193,7 +1194,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
@@ -2105,7 +2106,7 @@ unmap_qe:
return ret;
}
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_rdma_queue *queue = hctx->driver_data;
@@ -2231,7 +2232,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown);
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4ae562d30d2b..19dcb9279a24 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1918,7 +1918,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_stop_queue;
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_start_admin_queue(ctrl);
error = nvme_init_ctrl_finish(ctrl);
if (error)
@@ -1927,7 +1927,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
return 0;
out_quiesce_queue:
- blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_stop_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
out_stop_queue:
nvme_tcp_stop_queue(ctrl, 0);
@@ -1949,12 +1949,12 @@ out_free_queue:
static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
bool remove)
{
- blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_stop_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
if (remove)
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_start_admin_queue(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
@@ -1963,7 +1963,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
- blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_stop_admin_queue(ctrl);
nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_sync_io_queues(ctrl);
@@ -2058,7 +2058,7 @@ destroy_io:
nvme_tcp_destroy_io_queues(ctrl, new);
}
destroy_admin:
- blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_stop_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
@@ -2101,7 +2101,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
/* unquiesce to fail fast pending requests */
nvme_start_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_start_admin_queue(ctrl);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
@@ -2119,7 +2119,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
- blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_stop_admin_queue(ctrl);
if (shutdown)
nvme_shutdown_ctrl(ctrl);
else
@@ -2432,7 +2432,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
return 0;
}
-static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 0fc2781ab970..6139e1de50a6 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -5,6 +5,7 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/module.h>
#include "nvmet.h"
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0285ccc7541f..eb1094254c82 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -384,6 +384,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_cleanup_fabrics_q;
}
+ /* reset stopped state for the fresh admin queue */
+ clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
error = nvmf_connect_admin_queue(&ctrl->ctrl);
if (error)
@@ -398,7 +400,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_start_admin_queue(&ctrl->ctrl);
error = nvme_init_ctrl_finish(&ctrl->ctrl);
if (error)
@@ -428,7 +430,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
nvme_loop_destroy_io_queues(ctrl);
}
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_stop_admin_queue(&ctrl->ctrl);
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 891174ccd44b..38d1f292ecc2 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -5,6 +5,7 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
+#include <linux/blk-integrity.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index fa966e0db6ca..3a6f3af240fa 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -14,6 +14,7 @@
#define KMSG_COMPONENT "dasd"
#include <linux/interrupt.h>
+#include <linux/major.h>
#include <linux/fs.h>
#include <linux/blkpg.h>
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 5be3d1c39a78..59e513d34b0f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -30,7 +30,7 @@
static int dcssblk_open(struct block_device *bdev, fmode_t mode);
static void dcssblk_release(struct gendisk *disk, fmode_t mode);
-static blk_qc_t dcssblk_submit_bio(struct bio *bio);
+static void dcssblk_submit_bio(struct bio *bio);
static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
@@ -854,7 +854,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode)
up_write(&dcssblk_devices_sem);
}
-static blk_qc_t
+static void
dcssblk_submit_bio(struct bio *bio)
{
struct dcssblk_dev_info *dev_info;
@@ -907,10 +907,9 @@ dcssblk_submit_bio(struct bio *bio)
bytes_done += bvec.bv_len;
}
bio_endio(bio);
- return BLK_QC_T_NONE;
+ return;
fail:
bio_io_error(bio);
- return BLK_QC_T_NONE;
}
static long
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index 3ab669dc806f..27884f3106ab 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -3,6 +3,7 @@
* Copyright (c) 2017 Hisilicon Limited.
*/
+#include <linux/sched/clock.h>
#include "hisi_sas.h"
#define DRV_NAME "hisi_sas_v3_hw"
diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index befeb7c34290..337e6ed24821 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -22,6 +22,7 @@
*******************************************************************/
#include <scsi/scsi_host.h>
+#include <linux/hashtable.h>
#include <linux/ktime.h>
#include <linux/workqueue.h>
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 66f507469a31..40b473eea357 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5384,7 +5384,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
{
bool new_sd_dp;
bool inject = false;
- bool hipri = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_HIPRI;
+ bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED;
int k, num_in_q, qdepth;
unsigned long iflags;
u64 ns_from_boot = 0;
@@ -5471,7 +5471,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
if (sdebug_host_max_queue)
sd_dp->hc_idx = get_tag(cmnd);
- if (hipri)
+ if (polled)
ns_from_boot = ktime_get_boottime_ns();
/* one of the resp_*() response functions is called here */
@@ -5531,7 +5531,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
kt -= d;
}
}
- if (hipri) {
+ if (polled) {
sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt);
spin_lock_irqsave(&sqp->qc_lock, iflags);
if (!sd_dp->init_poll) {
@@ -5562,7 +5562,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) &&
atomic_read(&sdeb_inject_pending)))
sd_dp->aborted = true;
- if (hipri) {
+ if (polled) {
sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot);
spin_lock_irqsave(&sqp->qc_lock, iflags);
if (!sd_dp->init_poll) {
@@ -7331,7 +7331,7 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
if (kt_from_boot < sd_dp->cmpl_ts)
continue;
- } else /* ignoring non REQ_HIPRI requests */
+ } else /* ignoring non REQ_POLLED requests */
continue;
devip = (struct sdebug_dev_info *)scp->device->hostdata;
if (likely(devip))
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 572673873ddf..30f7d0b4eb73 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -21,6 +21,7 @@
#include <linux/hardirq.h>
#include <linux/scatterlist.h>
#include <linux/blk-mq.h>
+#include <linux/blk-integrity.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>
@@ -1783,7 +1784,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
}
-static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct Scsi_Host *shost = hctx->driver_data;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fce63335084e..9bdee968d3b5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -48,6 +48,7 @@
#include <linux/blkpg.h>
#include <linux/blk-pm.h>
#include <linux/delay.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/string_helpers.h>
#include <linux/async.h>
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 4cadb26070a8..349950616adc 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/t10-pi.h>
#include <scsi/scsi.h>
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 8f05248920e8..3c98f08dc25d 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -31,6 +31,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */
#include <linux/errno.h>
#include <linux/mtio.h>
#include <linux/ioctl.h>
+#include <linux/major.h>
#include <linux/slab.h>
#include <linux/fcntl.h>
#include <linux/init.h>
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 8b17b35283aa..115f7ef7a5de 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -44,6 +44,7 @@
#include <linux/cdrom.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/mutex.h>
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index ae8636d3780b..9933722acfd9 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -32,6 +32,7 @@ static const char *verstr = "20160209";
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/mtio.h>
+#include <linux/major.h>
#include <linux/cdrom.h>
#include <linux/ioctl.h>
#include <linux/fcntl.h>
diff --git a/drivers/scsi/ufs/ufshcd-crypto.c b/drivers/scsi/ufs/ufshcd-crypto.c
index d70cdcd35e43..67402baf6fae 100644
--- a/drivers/scsi/ufs/ufshcd-crypto.c
+++ b/drivers/scsi/ufs/ufshcd-crypto.c
@@ -48,11 +48,12 @@ out:
return err;
}
-static int ufshcd_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
- struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+ struct ufs_hba *hba =
+ container_of(profile, struct ufs_hba, crypto_profile);
const union ufs_crypto_cap_entry *ccap_array = hba->crypto_cap_array;
const struct ufs_crypto_alg_entry *alg =
&ufs_crypto_algs[key->crypto_cfg.crypto_mode];
@@ -105,11 +106,12 @@ static int ufshcd_clear_keyslot(struct ufs_hba *hba, int slot)
return ufshcd_program_key(hba, &cfg, slot);
}
-static int ufshcd_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+static int ufshcd_crypto_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
- struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm);
+ struct ufs_hba *hba =
+ container_of(profile, struct ufs_hba, crypto_profile);
return ufshcd_clear_keyslot(hba, slot);
}
@@ -120,11 +122,11 @@ bool ufshcd_crypto_enable(struct ufs_hba *hba)
return false;
/* Reset might clear all keys, so reprogram all the keys. */
- blk_ksm_reprogram_all_keys(&hba->ksm);
+ blk_crypto_reprogram_all_keys(&hba->crypto_profile);
return true;
}
-static const struct blk_ksm_ll_ops ufshcd_ksm_ops = {
+static const struct blk_crypto_ll_ops ufshcd_crypto_ops = {
.keyslot_program = ufshcd_crypto_keyslot_program,
.keyslot_evict = ufshcd_crypto_keyslot_evict,
};
@@ -179,15 +181,16 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
}
/* The actual number of configurations supported is (CFGC+1) */
- err = devm_blk_ksm_init(hba->dev, &hba->ksm,
- hba->crypto_capabilities.config_count + 1);
+ err = devm_blk_crypto_profile_init(
+ hba->dev, &hba->crypto_profile,
+ hba->crypto_capabilities.config_count + 1);
if (err)
goto out;
- hba->ksm.ksm_ll_ops = ufshcd_ksm_ops;
+ hba->crypto_profile.ll_ops = ufshcd_crypto_ops;
/* UFS only supports 8 bytes for any DUN */
- hba->ksm.max_dun_bytes_supported = 8;
- hba->ksm.dev = hba->dev;
+ hba->crypto_profile.max_dun_bytes_supported = 8;
+ hba->crypto_profile.dev = hba->dev;
/*
* Cache all the UFS crypto capabilities and advertise the supported
@@ -202,7 +205,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
blk_mode_num = ufshcd_find_blk_crypto_mode(
hba->crypto_cap_array[cap_idx]);
if (blk_mode_num != BLK_ENCRYPTION_MODE_INVALID)
- hba->ksm.crypto_modes_supported[blk_mode_num] |=
+ hba->crypto_profile.modes_supported[blk_mode_num] |=
hba->crypto_cap_array[cap_idx].sdus_mask * 512;
}
@@ -230,9 +233,8 @@ void ufshcd_init_crypto(struct ufs_hba *hba)
ufshcd_clear_keyslot(hba, slot);
}
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
- struct request_queue *q)
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q)
{
if (hba->caps & UFSHCD_CAP_CRYPTO)
- blk_ksm_register(&hba->ksm, q);
+ blk_crypto_register(&hba->crypto_profile, q);
}
diff --git a/drivers/scsi/ufs/ufshcd-crypto.h b/drivers/scsi/ufs/ufshcd-crypto.h
index 78a58e788dff..e18c01276873 100644
--- a/drivers/scsi/ufs/ufshcd-crypto.h
+++ b/drivers/scsi/ufs/ufshcd-crypto.h
@@ -18,7 +18,7 @@ static inline void ufshcd_prepare_lrbp_crypto(struct request *rq,
return;
}
- lrbp->crypto_key_slot = blk_ksm_get_slot_idx(rq->crypt_keyslot);
+ lrbp->crypto_key_slot = blk_crypto_keyslot_index(rq->crypt_keyslot);
lrbp->data_unit_num = rq->crypt_ctx->bc_dun[0];
}
@@ -40,8 +40,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba);
void ufshcd_init_crypto(struct ufs_hba *hba);
-void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
- struct request_queue *q);
+void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q);
#else /* CONFIG_SCSI_UFS_CRYPTO */
@@ -64,8 +63,8 @@ static inline int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
static inline void ufshcd_init_crypto(struct ufs_hba *hba) { }
-static inline void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba,
- struct request_queue *q) { }
+static inline void ufshcd_crypto_register(struct ufs_hba *hba,
+ struct request_queue *q) { }
#endif /* CONFIG_SCSI_UFS_CRYPTO */
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 41f2ff35f82b..1157b24963ef 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -4981,7 +4981,7 @@ static int ufshcd_slave_configure(struct scsi_device *sdev)
else if (ufshcd_is_rpm_autosuspend_allowed(hba))
sdev->rpm_autosuspend = 1;
- ufshcd_crypto_setup_rq_keyslot_manager(hba, q);
+ ufshcd_crypto_register(hba, q);
return 0;
}
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 41f6e06f9185..62bdc412d38a 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -32,7 +32,7 @@
#include <linux/regulator/consumer.h>
#include <linux/bitfield.h>
#include <linux/devfreq.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#include "unipro.h"
#include <asm/irq.h>
@@ -766,7 +766,7 @@ struct ufs_hba_monitor {
* @crypto_capabilities: Content of crypto capabilities register (0x100)
* @crypto_cap_array: Array of crypto capabilities
* @crypto_cfg_register: Start of the crypto cfg array
- * @ksm: the keyslot manager tied to this hba
+ * @crypto_profile: the crypto profile of this hba (if applicable)
*/
struct ufs_hba {
void __iomem *mmio_base;
@@ -911,7 +911,7 @@ struct ufs_hba {
union ufs_crypto_capabilities crypto_capabilities;
union ufs_crypto_cap_entry *crypto_cap_array;
u32 crypto_cfg_register;
- struct blk_keyslot_manager ksm;
+ struct blk_crypto_profile crypto_profile;
#endif
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_root;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 07d0250f17c3..b8455fcbf18b 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -22,6 +22,7 @@
#include <linux/virtio_scsi.h>
#include <linux/cpu.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index ef4a8e189fba..02f64453b4c5 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -20,6 +20,7 @@
#include <linux/vmalloc.h>
#include <linux/falloc.h>
#include <linux/uio.h>
+#include <linux/scatterlist.h>
#include <scsi/scsi_proto.h>
#include <asm/unaligned.h>
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 4069a1edcfa3..31df20abe141 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -16,12 +16,14 @@
#include <linux/timer.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/file.h>
#include <linux/module.h>
+#include <linux/scatterlist.h>
#include <scsi/scsi_proto.h>
#include <asm/unaligned.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0913ee50e6c3..6c7eb80220ca 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..66290b214f2b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7c096ab9bb5e..954b53a90f04 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -8248,7 +8249,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8278,7 +8279,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8372,15 +8373,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b2e86e739d7a..453dcff0e7f5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,7 +119,6 @@ struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
- blk_qc_t bio_cookie;
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->bio_disk = bio->bi_bdev->bd_disk;
- if (sdio->submit_io) {
+ if (sdio->submit_io)
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
- dio->bio_cookie = BLK_QC_T_NONE;
- } else
- dio->bio_cookie = submit_bio(bio);
+ else
+ submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
- blk_io_schedule();
+ blk_io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
- if (iocb->ki_flags & IOCB_HIPRI)
- dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac0e11bbb445..9c5559faacda 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index c1bf9ad4c220..20a083dc9042 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/lzo.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..4124a89a1a5d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 078ef29e31bc..5436a688157a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1351,7 +1351,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
@@ -1384,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3a098b473401..057d07cee9f8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2454,14 +2454,16 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
struct io_kiocb *req, *tmp;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
LIST_HEAD(done);
- bool spin;
/*
* Only spin for completions if we don't have multiple devices hanging
* off our complete list, and we're under the requested amount.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || *nr_events >= min)
+ poll_flags |= BLK_POLL_ONESHOT;
list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2479,17 +2481,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
list_move_tail(&req->inflight_entry, &done);
}
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
@@ -2735,19 +2740,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
inflight_entry);
- if (list_req->file != req->file) {
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..83ecfba53abe 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -38,8 +38,7 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct request_queue *last_queue;
- blk_qc_t cookie;
+ struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -49,29 +48,20 @@ struct iomap_dio {
};
};
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
- struct request_queue *q = READ_ONCE(kiocb->private);
-
- if (!q)
- return 0;
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI)
+ if (dio->iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, dio->iocb);
+ dio->submit.poll_bio = bio;
+ }
- dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+ dio->dops->submit_io(iter, bio, pos);
else
- dio->submit.cookie = submit_bio(bio);
+ submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
+ WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
+ WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
}
@@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (!iov_iter_count(dio->submit.iter))
goto out;
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
@@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (nr_pages)
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
+ dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
@@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- while ((ret = iomap_iter(&iomi, ops)) > 0)
+ while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
+
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ iocb->ki_flags &= ~IOCB_HIPRI;
+ }
+
blk_finish_plug(&plug);
/*
@@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
- WRITE_ONCE(iocb->private, dio->submit.last_queue);
+ WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
@@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !dio->submit.last_queue ||
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie, true))
+ if (!dio->submit.poll_bio ||
+ !bio_poll(dio->submit.poll_bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ab4f3362466d..373dbb627657 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -5,6 +5,7 @@
* Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*/
+#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 43b1451bff53..a3cd3c3f091e 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -8,6 +8,7 @@
*/
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index c1bb4c4b5d67..e5e3e500ed46 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -10,7 +10,7 @@
* Linux VFS inode operations.
*/
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 2f2e430461b2..8bb0a53a658b 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/hashtable.h>
+#include <linux/seq_file.h>
/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
static struct kmem_cache *orangefs_inode_cache;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2bcc9a6f1bfc..052f143e2e0e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
+#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 65e7e56005b8..e2302342a67f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
#include "internal.h"
struct ramfs_mount_opts {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..62e7fbe4e54c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index ddc346a9df9b..3ce5f47338cb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
.write_iter = zonefs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
};
static struct kmem_cache *zonefs_inode_cachep;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index a62e72dd829f..9c14f0a8dbe5 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -12,13 +12,13 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
-#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/writeback.h>
-#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>
+struct blkcg;
+
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
kref_get(&bdi->refcnt);
@@ -133,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb)
return test_bit(WB_writeback_running, &wb->state);
}
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
- struct super_block *sb;
-
- if (!inode)
- return &noop_backing_dev_info;
-
- sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
- if (sb_is_blkdev_sb(sb))
- return I_BDEV(inode)->bd_disk->bdi;
-#endif
- return sb->s_bdi;
-}
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
{
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 00952e92eae1..fe6bdfbbef66 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -6,19 +6,10 @@
#define __LINUX_BIO_H
#include <linux/mempool.h>
-#include <linux/ioprio.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#include <linux/blk_types.h>
#include <linux/uio.h>
-#define BIO_DEBUG
-
-#ifdef BIO_DEBUG
-#define BIO_BUG_ON BUG_ON
-#else
-#define BIO_BUG_ON
-#endif
-
#define BIO_MAX_VECS 256U
static inline unsigned int bio_max_segs(unsigned int nr_segs)
@@ -78,22 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
bio_op(bio) == REQ_OP_WRITE_ZEROES;
}
-static inline bool bio_mergeable(struct bio *bio)
-{
- if (bio->bi_opf & REQ_NOMERGE_FLAGS)
- return false;
-
- return true;
-}
-
-static inline unsigned int bio_cur_bytes(struct bio *bio)
-{
- if (bio_has_data(bio))
- return bio_iovec(bio).bv_len;
- else /* dataless requests such as discard */
- return bio->bi_iter.bi_size;
-}
-
static inline void *bio_data(struct bio *bio)
{
if (bio_has_data(bio))
@@ -102,25 +77,6 @@ static inline void *bio_data(struct bio *bio)
return NULL;
}
-/**
- * bio_full - check if the bio is full
- * @bio: bio to check
- * @len: length of one segment to be added
- *
- * Return true if @bio is full and one segment with @len bytes can't be
- * added to the bio, otherwise return false
- */
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return true;
-
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return true;
-
- return false;
-}
-
static inline bool bio_next_segment(const struct bio *bio,
struct bvec_iter_all *iter)
{
@@ -163,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio,
bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
}
+void __bio_advance(struct bio *, unsigned bytes);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio: bio to advance
+ * @bytes: number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+static inline void bio_advance(struct bio *bio, unsigned int nbytes)
+{
+ if (nbytes == bio->bi_iter.bi_size) {
+ bio->bi_iter.bi_size = 0;
+ return;
+ }
+ __bio_advance(bio, nbytes);
+}
+
#define __bio_for_each_segment(bvl, bio, iter, start) \
for (iter = (start); \
(iter).bi_size && \
@@ -265,37 +243,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
bio->bi_flags &= ~(1U << bit);
}
-static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
-{
- *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-}
-
-static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
-{
- struct bvec_iter iter = bio->bi_iter;
- int idx;
-
- bio_get_first_bvec(bio, bv);
- if (bv->bv_len == bio->bi_iter.bi_size)
- return; /* this bio only has a single bvec */
-
- bio_advance_iter(bio, &iter, iter.bi_size);
-
- if (!iter.bi_bvec_done)
- idx = iter.bi_idx - 1;
- else /* in the middle of bvec */
- idx = iter.bi_idx;
-
- *bv = bio->bi_io_vec[idx];
-
- /*
- * iter.bi_bvec_done records actual length of the last bvec
- * if this bio ends in the middle of one io vector
- */
- if (iter.bi_bvec_done)
- bv->bv_len = iter.bi_bvec_done;
-}
-
static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
{
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
@@ -424,7 +371,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
}
-extern blk_qc_t submit_bio(struct bio *);
+void submit_bio(struct bio *bio);
extern void bio_endio(struct bio *);
@@ -456,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
struct request_queue;
extern int submit_bio_wait(struct bio *bio);
-extern void bio_advance(struct bio *, unsigned);
-
extern void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs);
extern void bio_uninit(struct bio *);
@@ -469,12 +414,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
unsigned int, unsigned int);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset);
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off, bool *same_page);
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-void bio_release_pages(struct bio *bio, bool mark_dirty);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
@@ -482,27 +426,16 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio);
-void bio_truncate(struct bio *bio, unsigned new_size);
void guard_bio_eod(struct bio *bio);
void zero_fill_bio(struct bio *bio);
-extern const char *bio_devname(struct bio *bio, char *buffer);
+static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
+{
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+ __bio_release_pages(bio, mark_dirty);
+}
-#define bio_set_dev(bio, bdev) \
-do { \
- bio_clear_flag(bio, BIO_REMAPPED); \
- if ((bio)->bi_bdev != (bdev)) \
- bio_clear_flag(bio, BIO_THROTTLED); \
- (bio)->bi_bdev = (bdev); \
- bio_associate_blkg(bio); \
-} while (0)
-
-#define bio_copy_dev(dst, src) \
-do { \
- bio_clear_flag(dst, BIO_REMAPPED); \
- (dst)->bi_bdev = (src)->bi_bdev; \
- bio_clone_blkg_association(dst, src); \
-} while (0)
+extern const char *bio_devname(struct bio *bio, char *buffer);
#define bio_dev(bio) \
disk_devt((bio)->bi_bdev->bd_disk)
@@ -521,6 +454,22 @@ static inline void bio_clone_blkg_association(struct bio *dst,
struct bio *src) { }
#endif /* CONFIG_BLK_CGROUP */
+static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+ bio_clear_flag(bio, BIO_REMAPPED);
+ if (bio->bi_bdev != bdev)
+ bio_clear_flag(bio, BIO_THROTTLED);
+ bio->bi_bdev = bdev;
+ bio_associate_blkg(bio);
+}
+
+static inline void bio_copy_dev(struct bio *dst, struct bio *src)
+{
+ bio_clear_flag(dst, BIO_REMAPPED);
+ dst->bi_bdev = src->bi_bdev;
+ bio_clone_blkg_association(dst, src);
+}
+
/*
* BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
*
@@ -784,7 +733,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
*/
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
- bio->bi_opf |= REQ_HIPRI;
+ bio->bi_opf |= REQ_POLLED;
if (!is_sync_kiocb(kiocb))
bio->bi_opf |= REQ_NOWAIT;
}
diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
new file mode 100644
index 000000000000..bbab65bd5428
--- /dev/null
+++ b/include/linux/blk-crypto-profile.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_BLK_CRYPTO_PROFILE_H
+#define __LINUX_BLK_CRYPTO_PROFILE_H
+
+#include <linux/bio.h>
+#include <linux/blk-crypto.h>
+
+struct blk_crypto_profile;
+
+/**
+ * struct blk_crypto_ll_ops - functions to control inline encryption hardware
+ *
+ * Low-level operations for controlling inline encryption hardware. This
+ * interface must be implemented by storage drivers that support inline
+ * encryption. All functions may sleep, are serialized by profile->lock, and
+ * are never called while profile->dev (if set) is runtime-suspended.
+ */
+struct blk_crypto_ll_ops {
+
+ /**
+ * @keyslot_program: Program a key into the inline encryption hardware.
+ *
+ * Program @key into the specified @slot in the inline encryption
+ * hardware, overwriting any key that the keyslot may already contain.
+ * The keyslot is guaranteed to not be in-use by any I/O.
+ *
+ * This is required if the device has keyslots. Otherwise (i.e. if the
+ * device is a layered device, or if the device is real hardware that
+ * simply doesn't have the concept of keyslots) it is never called.
+ *
+ * Must return 0 on success, or -errno on failure.
+ */
+ int (*keyslot_program)(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot);
+
+ /**
+ * @keyslot_evict: Evict a key from the inline encryption hardware.
+ *
+ * If the device has keyslots, this function must evict the key from the
+ * specified @slot. The slot will contain @key, but there should be no
+ * need for the @key argument to be used as @slot should be sufficient.
+ * The keyslot is guaranteed to not be in-use by any I/O.
+ *
+ * If the device doesn't have keyslots itself, this function must evict
+ * @key from any underlying devices. @slot won't be valid in this case.
+ *
+ * If there are no keyslots and no underlying devices, this function
+ * isn't required.
+ *
+ * Must return 0 on success, or -errno on failure.
+ */
+ int (*keyslot_evict)(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot);
+};
+
+/**
+ * struct blk_crypto_profile - inline encryption profile for a device
+ *
+ * This struct contains a storage device's inline encryption capabilities (e.g.
+ * the supported crypto algorithms), driver-provided functions to control the
+ * inline encryption hardware (e.g. programming and evicting keys), and optional
+ * device-independent keyslot management data.
+ */
+struct blk_crypto_profile {
+
+ /* public: Drivers must initialize the following fields. */
+
+ /**
+ * @ll_ops: Driver-provided functions to control the inline encryption
+ * hardware, e.g. program and evict keys.
+ */
+ struct blk_crypto_ll_ops ll_ops;
+
+ /**
+ * @max_dun_bytes_supported: The maximum number of bytes supported for
+ * specifying the data unit number (DUN). Specifically, the range of
+ * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1.
+ */
+ unsigned int max_dun_bytes_supported;
+
+ /**
+ * @modes_supported: Array of bitmasks that specifies whether each
+ * combination of crypto mode and data unit size is supported.
+ * Specifically, the i'th bit of modes_supported[crypto_mode] is set if
+ * crypto_mode can be used with a data unit size of (1 << i). Note that
+ * only data unit sizes that are powers of 2 can be supported.
+ */
+ unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX];
+
+ /**
+ * @dev: An optional device for runtime power management. If the driver
+ * provides this device, it will be runtime-resumed before any function
+ * in @ll_ops is called and will remain resumed during the call.
+ */
+ struct device *dev;
+
+ /* private: The following fields shouldn't be accessed by drivers. */
+
+ /* Number of keyslots, or 0 if not applicable */
+ unsigned int num_slots;
+
+ /*
+ * Serializes all calls to functions in @ll_ops as well as all changes
+ * to @slot_hashtable. This can also be taken in read mode to look up
+ * keyslots while ensuring that they can't be changed concurrently.
+ */
+ struct rw_semaphore lock;
+
+ /* List of idle slots, with least recently used slot at front */
+ wait_queue_head_t idle_slots_wait_queue;
+ struct list_head idle_slots;
+ spinlock_t idle_slots_lock;
+
+ /*
+ * Hash table which maps struct *blk_crypto_key to keyslots, so that we
+ * can find a key's keyslot in O(1) time rather than O(num_slots).
+ * Protected by 'lock'.
+ */
+ struct hlist_head *slot_hashtable;
+ unsigned int log_slot_ht_size;
+
+ /* Per-keyslot data */
+ struct blk_crypto_keyslot *slots;
+};
+
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+ unsigned int num_slots);
+
+int devm_blk_crypto_profile_init(struct device *dev,
+ struct blk_crypto_profile *profile,
+ unsigned int num_slots);
+
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot);
+
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key);
+
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile);
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile);
+
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+ const struct blk_crypto_profile *child);
+
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+ const struct blk_crypto_profile *reference);
+
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+ const struct blk_crypto_profile *src);
+
+#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
new file mode 100644
index 000000000000..8a038ea0717e
--- /dev/null
+++ b/include/linux/blk-integrity.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BLK_INTEGRITY_H
+#define _LINUX_BLK_INTEGRITY_H
+
+#include <linux/blk-mq.h>
+
+struct request;
+
+enum blk_integrity_flags {
+ BLK_INTEGRITY_VERIFY = 1 << 0,
+ BLK_INTEGRITY_GENERATE = 1 << 1,
+ BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
+ BLK_INTEGRITY_IP_CHECKSUM = 1 << 3,
+};
+
+struct blk_integrity_iter {
+ void *prot_buf;
+ void *data_buf;
+ sector_t seed;
+ unsigned int data_size;
+ unsigned short interval;
+ const char *disk_name;
+};
+
+typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef void (integrity_prepare_fn) (struct request *);
+typedef void (integrity_complete_fn) (struct request *, unsigned int);
+
+struct blk_integrity_profile {
+ integrity_processing_fn *generate_fn;
+ integrity_processing_fn *verify_fn;
+ integrity_prepare_fn *prepare_fn;
+ integrity_complete_fn *complete_fn;
+ const char *name;
+};
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+void blk_integrity_register(struct gendisk *, struct blk_integrity *);
+void blk_integrity_unregister(struct gendisk *);
+int blk_integrity_compare(struct gendisk *, struct gendisk *);
+int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+ struct scatterlist *);
+int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+ struct blk_integrity *bi = &disk->queue->integrity;
+
+ if (!bi->profile)
+ return NULL;
+
+ return bi;
+}
+
+static inline struct blk_integrity *
+bdev_get_integrity(struct block_device *bdev)
+{
+ return blk_get_integrity(bdev->bd_disk);
+}
+
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+ return q->integrity.profile;
+}
+
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+ unsigned int segs)
+{
+ q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+ return q->limits.max_integrity_segments;
+}
+
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi: blk_integrity profile for device
+ * @sectors: Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device. Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
+static inline bool blk_integrity_rq(struct request *rq)
+{
+ return rq->cmd_flags & REQ_INTEGRITY;
+}
+
+/*
+ * Return the first bvec that contains integrity data. Only drivers that are
+ * limited to a single integrity segment should use this helper.
+ */
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+ if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
+ return NULL;
+ return rq->bio->bi_integrity->bip_vec;
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+ struct bio *b)
+{
+ return 0;
+}
+static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+ struct bio *b,
+ struct scatterlist *s)
+{
+ return 0;
+}
+static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+{
+ return NULL;
+}
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+ return NULL;
+}
+static inline bool
+blk_integrity_queue_supports_integrity(struct request_queue *q)
+{
+ return false;
+}
+static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+{
+ return 0;
+}
+static inline void blk_integrity_register(struct gendisk *d,
+ struct blk_integrity *b)
+{
+}
+static inline void blk_integrity_unregister(struct gendisk *d)
+{
+}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+ unsigned int segs)
+{
+}
+static inline unsigned short
+queue_max_integrity_segments(const struct request_queue *q)
+{
+ return 0;
+}
+
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return 0;
+}
+static inline int blk_integrity_rq(struct request *rq)
+{
+ return 0;
+}
+
+static inline struct bio_vec *rq_integrity_vec(struct request *rq)
+{
+ return NULL;
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+#endif /* _LINUX_BLK_INTEGRITY_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 13ba1861e688..b4039fdf1b04 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -6,10 +6,226 @@
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>
+#include <linux/scatterlist.h>
+#include <linux/prefetch.h>
struct blk_mq_tags;
struct blk_flush_queue;
+#define BLKDEV_MIN_RQ 4
+#define BLKDEV_DEFAULT_RQ 128
+
+typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+
+/*
+ * request flags */
+typedef __u32 __bitwise req_flags_t;
+
+/* drive already may have started this one */
+#define RQF_STARTED ((__force req_flags_t)(1 << 1))
+/* may not be passed by ioscheduler */
+#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
+/* request for flush sequence */
+#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
+/* merge of different types, fail separately */
+#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5))
+/* track inflight for MQ */
+#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6))
+/* don't call prep for this one */
+#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
+/* vaguely specified driver internal error. Ignored by the block layer */
+#define RQF_FAILED ((__force req_flags_t)(1 << 10))
+/* don't warn about errors */
+#define RQF_QUIET ((__force req_flags_t)(1 << 11))
+/* elevator private data attached */
+#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12))
+/* account into disk and partition IO statistics */
+#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
+/* runtime pm request */
+#define RQF_PM ((__force req_flags_t)(1 << 15))
+/* on IO scheduler merge hash */
+#define RQF_HASHED ((__force req_flags_t)(1 << 16))
+/* track IO completion time */
+#define RQF_STATS ((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+ bio chain. */
+#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
+/* ->timeout has been called, don't expire again */
+#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
+/* queue has elevator attached */
+#define RQF_ELV ((__force req_flags_t)(1 << 22))
+
+/* flags that prevent us from merging requests: */
+#define RQF_NOMERGE_FLAGS \
+ (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+
+enum mq_rq_state {
+ MQ_RQ_IDLE = 0,
+ MQ_RQ_IN_FLIGHT = 1,
+ MQ_RQ_COMPLETE = 2,
+};
+
+/*
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
+ */
+struct request {
+ struct request_queue *q;
+ struct blk_mq_ctx *mq_ctx;
+ struct blk_mq_hw_ctx *mq_hctx;
+
+ unsigned int cmd_flags; /* op and common flags */
+ req_flags_t rq_flags;
+
+ int tag;
+ int internal_tag;
+
+ unsigned int timeout;
+
+ /* the following two fields are internal, NEVER access directly */
+ unsigned int __data_len; /* total data len */
+ sector_t __sector; /* sector cursor */
+
+ struct bio *bio;
+ struct bio *biotail;
+
+ union {
+ struct list_head queuelist;
+ struct request *rq_next;
+ };
+
+ struct gendisk *rq_disk;
+ struct block_device *part;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+ /* Time that the first bio started allocating this request. */
+ u64 alloc_time_ns;
+#endif
+ /* Time that this request was allocated for this IO. */
+ u64 start_time_ns;
+ /* Time that I/O was submitted to the device. */
+ u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+ unsigned short wbt_flags;
+#endif
+ /*
+ * rq sectors used for blk stats. It has the same value
+ * with blk_rq_sectors(rq), except that it never be zeroed
+ * by completion.
+ */
+ unsigned short stats_sectors;
+
+ /*
+ * Number of scatter-gather DMA addr+len pairs after
+ * physical address coalescing is performed.
+ */
+ unsigned short nr_phys_segments;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ unsigned short nr_integrity_segments;
+#endif
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ struct bio_crypt_ctx *crypt_ctx;
+ struct blk_crypto_keyslot *crypt_keyslot;
+#endif
+
+ unsigned short write_hint;
+ unsigned short ioprio;
+
+ enum mq_rq_state state;
+ refcount_t ref;
+
+ unsigned long deadline;
+
+ /*
+ * The hash is used inside the scheduler, and killed once the
+ * request reaches the dispatch list. The ipi_list is only used
+ * to queue the request for softirq completion, which is long
+ * after the request has been unhashed (and even removed from
+ * the dispatch list).
+ */
+ union {
+ struct hlist_node hash; /* merge hash */
+ struct llist_node ipi_list;
+ };
+
+ /*
+ * The rb_node is only used inside the io scheduler, requests
+ * are pruned when moved to the dispatch queue. So let the
+ * completion_data share space with the rb_node.
+ */
+ union {
+ struct rb_node rb_node; /* sort/lookup */
+ struct bio_vec special_vec;
+ void *completion_data;
+ int error_count; /* for legacy drivers, don't use */
+ };
+
+
+ /*
+ * Three pointers are available for the IO schedulers, if they need
+ * more they have to dynamically allocate it. Flush requests are
+ * never put on the IO scheduler. So let the flush fields share
+ * space with the elevator data.
+ */
+ union {
+ struct {
+ struct io_cq *icq;
+ void *priv[2];
+ } elv;
+
+ struct {
+ unsigned int seq;
+ struct list_head list;
+ rq_end_io_fn *saved_end_io;
+ } flush;
+ };
+
+ union {
+ struct __call_single_data csd;
+ u64 fifo_time;
+ };
+
+ /*
+ * completion callback.
+ */
+ rq_end_io_fn *end_io;
+ void *end_io_data;
+};
+
+#define req_op(req) \
+ ((req)->cmd_flags & REQ_OP_MASK)
+
+static inline bool blk_rq_is_passthrough(struct request *rq)
+{
+ return blk_op_is_passthrough(req_op(rq));
+}
+
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+ return req->ioprio;
+}
+
+#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)
+
+#define rq_dma_dir(rq) \
+ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
+enum blk_eh_timer_return {
+ BLK_EH_DONE, /* drivers has completed the command */
+ BLK_EH_RESET_TIMER, /* reset timer and try again */
+};
+
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
+
/**
* struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
* block device
@@ -126,9 +342,6 @@ struct blk_mq_hw_ctx {
unsigned long queued;
/** @run: Number of dispatched requests. */
unsigned long run;
-#define BLK_MQ_MAX_DISPATCH_ORDER 7
- /** @dispatched: Number of dispatch requests by queue. */
- unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
/** @numa_node: NUMA node the storage adapter has been connected to. */
unsigned int numa_node;
@@ -148,13 +361,6 @@ struct blk_mq_hw_ctx {
/** @kobj: Kernel object for sysfs. */
struct kobject kobj;
- /** @poll_considered: Count times blk_poll() was called. */
- unsigned long poll_considered;
- /** @poll_invoked: Count how many requests blk_poll() polled. */
- unsigned long poll_invoked;
- /** @poll_success: Count how many polled requests were completed. */
- unsigned long poll_success;
-
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @debugfs_dir: debugfs directory for this hardware queue. Named
@@ -232,13 +438,11 @@ enum hctx_type {
* @flags: Zero or more BLK_MQ_F_* flags.
* @driver_data: Pointer to data owned by the block driver that created this
* tag set.
- * @active_queues_shared_sbitmap:
- * number of active request queues per tag set.
- * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
- * @__breserved_tags:
- * A shared reserved tags sbitmap, used over all hctx's
* @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
* elements.
+ * @shared_tags:
+ * Shared set of tags. Has @nr_hw_queues elements. If set,
+ * shared by all @tags.
* @tag_list_lock: Serializes tag_list accesses.
* @tag_list: List of the request queues that use this tag set. See also
* request_queue.tag_set_list.
@@ -255,12 +459,11 @@ struct blk_mq_tag_set {
unsigned int timeout;
unsigned int flags;
void *driver_data;
- atomic_t active_queues_shared_sbitmap;
- struct sbitmap_queue __bitmap_tags;
- struct sbitmap_queue __breserved_tags;
struct blk_mq_tags **tags;
+ struct blk_mq_tags *shared_tags;
+
struct mutex tag_list_lock;
struct list_head tag_list;
};
@@ -330,7 +533,7 @@ struct blk_mq_ops {
/**
* @poll: Called to poll for completion of a specific tag.
*/
- int (*poll)(struct blk_mq_hw_ctx *);
+ int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
/**
* @complete: Mark the request as complete.
@@ -432,6 +635,8 @@ enum {
((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
+#define BLK_MQ_NO_HCTX_IDX (-1U)
+
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata) \
@@ -451,8 +656,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-
void blk_mq_free_request(struct request *rq);
bool blk_mq_queue_inflight(struct request_queue *q);
@@ -471,7 +674,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags,
unsigned int hctx_idx);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+ unsigned int nr_tags;
+ unsigned int nr_reserved_tags;
+
+ atomic_t active_queues;
+
+ struct sbitmap_queue bitmap_tags;
+ struct sbitmap_queue breserved_tags;
+
+ struct request **rqs;
+ struct request **static_rqs;
+ struct list_head page_list;
+
+ /*
+ * used to clear request reference in rqs[] before freeing one
+ * request pool
+ */
+ spinlock_t lock;
+};
+
+static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
+ unsigned int tag)
+{
+ if (tag < tags->nr_tags) {
+ prefetch(tags->rqs[tag]);
+ return tags->rqs[tag];
+ }
+
+ return NULL;
+}
enum {
BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -524,6 +760,35 @@ static inline void blk_mq_set_request_complete(struct request *rq)
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_comp_batch *ib);
+
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+}
+
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+ struct io_comp_batch *iob, int ioerror,
+ void (*complete)(struct io_comp_batch *))
+{
+ if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+ return false;
+ if (!iob->complete)
+ iob->complete = complete;
+ else if (iob->complete != complete)
+ return false;
+ iob->need_ts |= blk_mq_need_time_stamp(req);
+ rq_list_add(&iob->req_list, req);
+ return true;
+}
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
@@ -605,16 +870,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
for ((i) = 0; (i) < (hctx)->nr_ctx && \
({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
-static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- if (rq->tag != -1)
- return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
-
- return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
- BLK_QC_T_INTERNAL;
-}
-
static inline void blk_mq_cleanup_rq(struct request *rq)
{
if (rq->q->mq_ops->cleanup_rq)
@@ -633,8 +888,268 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
rq->rq_disk = bio->bi_bdev->bd_disk;
}
-blk_qc_t blk_mq_submit_bio(struct bio *bio);
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
struct lock_class_key *key);
+static inline bool rq_is_sync(struct request *rq)
+{
+ return op_is_sync(rq->cmd_flags);
+}
+
+void blk_rq_init(struct request_queue *q, struct request *rq);
+void blk_put_request(struct request *rq);
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+ blk_mq_req_flags_t flags);
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+ struct bio_set *bs, gfp_t gfp_mask,
+ int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
+void blk_rq_unprep_clone(struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request_queue *q,
+ struct request *rq);
+
+struct rq_map_data {
+ struct page **pages;
+ int page_order;
+ int nr_entries;
+ unsigned long offset;
+ int null_mapped;
+ int from_user;
+};
+
+int blk_rq_map_user(struct request_queue *, struct request *,
+ struct rq_map_data *, void __user *, unsigned long, gfp_t);
+int blk_rq_map_user_iov(struct request_queue *, struct request *,
+ struct rq_map_data *, const struct iov_iter *, gfp_t);
+int blk_rq_unmap_user(struct bio *);
+int blk_rq_map_kern(struct request_queue *, struct request *, void *,
+ unsigned int, gfp_t);
+int blk_rq_append_bio(struct request *rq, struct bio *bio);
+void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
+ rq_end_io_fn *);
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+ int at_head);
+
+struct req_iterator {
+ struct bvec_iter iter;
+ struct bio *bio;
+};
+
+#define __rq_for_each_bio(_bio, rq) \
+ if ((rq->bio)) \
+ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
+
+#define rq_for_each_segment(bvl, _rq, _iter) \
+ __rq_for_each_bio(_iter.bio, _rq) \
+ bio_for_each_segment(bvl, _iter.bio, _iter.iter)
+
+#define rq_for_each_bvec(bvl, _rq, _iter) \
+ __rq_for_each_bio(_iter.bio, _rq) \
+ bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
+#define rq_iter_last(bvec, _iter) \
+ (_iter.bio->bi_next == NULL && \
+ bio_iter_last(bvec, _iter.iter))
+
+/*
+ * blk_rq_pos() : the current sector
+ * blk_rq_bytes() : bytes left in the entire request
+ * blk_rq_cur_bytes() : bytes left in the current segment
+ * blk_rq_err_bytes() : bytes left till the next error boundary
+ * blk_rq_sectors() : sectors left in the entire request
+ * blk_rq_cur_sectors() : sectors left in the current segment
+ * blk_rq_stats_sectors() : sectors of the entire request used for stats
+ */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+ return rq->__sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+ return rq->__data_len;
+}
+
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+ if (!rq->bio)
+ return 0;
+ if (!bio_has_data(rq->bio)) /* dataless requests such as discard */
+ return rq->bio->bi_iter.bi_size;
+ return bio_iovec(rq->bio).bv_len;
+}
+
+unsigned int blk_rq_err_bytes(const struct request *rq);
+
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+ return blk_rq_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+ return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+ return rq->stats_sectors;
+}
+
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request. Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+ return rq->special_vec.bv_len;
+ return blk_rq_bytes(rq);
+}
+
+/*
+ * Return the first full biovec in the request. The caller needs to check that
+ * there are any bvecs before calling this helper.
+ */
+static inline struct bio_vec req_bvec(struct request *rq)
+{
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+ return rq->special_vec;
+ return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
+}
+
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+ unsigned int nr_bios = 0;
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ nr_bios++;
+
+ return nr_bios;
+}
+
+void blk_steal_bios(struct bio_list *list, struct request *rq);
+
+/*
+ * Request completion related functions.
+ *
+ * blk_update_request() completes given number of bytes and updates
+ * the request without completing it.
+ */
+bool blk_update_request(struct request *rq, blk_status_t error,
+ unsigned int nr_bytes);
+void blk_abort_request(struct request *);
+
+/*
+ * Number of physical segments as sent to the device.
+ *
+ * Normally this is the number of discontiguous data segments sent by the
+ * submitter. But for data-less command like discard we might have no
+ * actual data segments submitted, but the driver might have to add it's
+ * own special payload. In that case we still return 1 here so that this
+ * special payload will be mapped.
+ */
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+ return 1;
+ return rq->nr_phys_segments;
+}
+
+/*
+ * Number of discard segments (or ranges) the driver needs to fill in.
+ * Each discard bio merged into a request is counted as one segment.
+ */
+static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
+{
+ return max_t(unsigned short, rq->nr_phys_segments, 1);
+}
+
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+ struct scatterlist *sglist, struct scatterlist **last_sg);
+static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+ struct scatterlist *sglist)
+{
+ struct scatterlist *last_sg = NULL;
+
+ return __blk_rq_map_sg(q, rq, sglist, &last_sg);
+}
+void blk_dump_rq_flags(struct request *, char *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+ return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+ return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
+bool blk_req_needs_zone_write_lock(struct request *rq);
+bool blk_req_zone_write_trylock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+ if (blk_req_needs_zone_write_lock(rq))
+ __blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+ if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+ __blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+ return rq->q->seq_zones_wlock &&
+ test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+ if (!blk_req_needs_zone_write_lock(rq))
+ return true;
+ return !blk_req_zone_is_write_locked(rq);
+}
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+ return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+ return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+ return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void rq_flush_dcache_pages(struct request *rq);
+#else
+static inline void rq_flush_dcache_pages(struct request *rq)
+{
+}
+#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+#endif /* BLK_MQ_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index be622b5a21ed..1e370929c89e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -38,6 +38,7 @@ struct block_device {
u8 bd_partno;
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
+ struct request_queue * bd_queue;
/* The counter of freeze processes */
int bd_fsfreeze_count;
@@ -208,6 +209,9 @@ static inline void bio_issue_init(struct bio_issue *issue,
((u64)size << BIO_ISSUE_SIZE_SHIFT));
}
+typedef unsigned int blk_qc_t;
+#define BLK_QC_T_NONE -1U
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
@@ -227,8 +231,8 @@ struct bio {
struct bvec_iter bi_iter;
+ blk_qc_t bi_cookie;
bio_end_io_t *bi_end_io;
-
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
/*
@@ -384,7 +388,7 @@ enum req_flag_bits {
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
- __REQ_HIPRI,
+ __REQ_POLLED, /* caller polls for completion using bio_poll */
/* for driver use */
__REQ_DRV,
@@ -409,7 +413,7 @@ enum req_flag_bits {
#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
-#define REQ_HIPRI (1ULL << __REQ_HIPRI)
+#define REQ_POLLED (1ULL << __REQ_POLLED)
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
@@ -431,8 +435,6 @@ enum stat_group {
#define bio_op(bio) \
((bio)->bi_opf & REQ_OP_MASK)
-#define req_op(req) \
- ((req)->cmd_flags & REQ_OP_MASK)
/* obsolete, don't use in new code */
static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
@@ -497,31 +499,6 @@ static inline int op_stat_group(unsigned int op)
return op_is_write(op);
}
-typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE -1U
-#define BLK_QC_T_SHIFT 16
-#define BLK_QC_T_INTERNAL (1U << 31)
-
-static inline bool blk_qc_t_valid(blk_qc_t cookie)
-{
- return cookie != BLK_QC_T_NONE;
-}
-
-static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-{
- return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
-}
-
-static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
-{
- return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
-}
-
-static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
-{
- return (cookie & BLK_QC_T_INTERNAL) != 0;
-}
-
struct blk_rq_stat {
u64 mean;
u64 min;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12b9dbcc980e..d2d627e2c782 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -3,8 +3,6 @@
#define _LINUX_BLKDEV_H
#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/major.h>
#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/llist.h>
@@ -12,17 +10,11 @@
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
-#include <linux/mempool.h>
-#include <linux/pfn.h>
#include <linux/bio.h>
-#include <linux/stringify.h>
#include <linux/gfp.h>
-#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
-#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
-#include <linux/pm.h>
#include <linux/sbitmap.h>
struct module;
@@ -33,14 +25,12 @@ struct request;
struct sg_io_hdr;
struct blkcg_gq;
struct blk_flush_queue;
+struct kiocb;
struct pr_ops;
struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
-struct blk_keyslot_manager;
-
-#define BLKDEV_MIN_RQ 4
-#define BLKDEV_MAX_RQ 128 /* Default maximum */
+struct blk_crypto_profile;
/* Must be consistent with blk_mq_poll_stats_bkt() */
#define BLK_MQ_POLL_STATS_BKTS 16
@@ -54,186 +44,13 @@ struct blk_keyslot_manager;
*/
#define BLKCG_MAX_POLS 6
-typedef void (rq_end_io_fn)(struct request *, blk_status_t);
-
-/*
- * request flags */
-typedef __u32 __bitwise req_flags_t;
-
-/* drive already may have started this one */
-#define RQF_STARTED ((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
-/* request for flush sequence */
-#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
-/* merge of different types, fail separately */
-#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6))
-/* don't call prep for this one */
-#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
-/* vaguely specified driver internal error. Ignored by the block layer */
-#define RQF_FAILED ((__force req_flags_t)(1 << 10))
-/* don't warn about errors */
-#define RQF_QUIET ((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12))
-/* account into disk and partition IO statistics */
-#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
-/* runtime pm request */
-#define RQF_PM ((__force req_flags_t)(1 << 15))
-/* on IO scheduler merge hash */
-#define RQF_HASHED ((__force req_flags_t)(1 << 16))
-/* track IO completion time */
-#define RQF_STATS ((__force req_flags_t)(1 << 17))
-/* Look at ->special_vec for the actual data payload instead of the
- bio chain. */
-#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
-
-/* flags that prevent us from merging requests: */
-#define RQF_NOMERGE_FLAGS \
- (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
-
-/*
- * Request state for blk-mq.
- */
-enum mq_rq_state {
- MQ_RQ_IDLE = 0,
- MQ_RQ_IN_FLIGHT = 1,
- MQ_RQ_COMPLETE = 2,
-};
-
-/*
- * Try to put the fields that are referenced together in the same cacheline.
- *
- * If you modify this structure, make sure to update blk_rq_init() and
- * especially blk_mq_rq_ctx_init() to take care of the added fields.
- */
-struct request {
- struct request_queue *q;
- struct blk_mq_ctx *mq_ctx;
- struct blk_mq_hw_ctx *mq_hctx;
-
- unsigned int cmd_flags; /* op and common flags */
- req_flags_t rq_flags;
-
- int tag;
- int internal_tag;
-
- /* the following two fields are internal, NEVER access directly */
- unsigned int __data_len; /* total data len */
- sector_t __sector; /* sector cursor */
-
- struct bio *bio;
- struct bio *biotail;
-
- struct list_head queuelist;
-
- /*
- * The hash is used inside the scheduler, and killed once the
- * request reaches the dispatch list. The ipi_list is only used
- * to queue the request for softirq completion, which is long
- * after the request has been unhashed (and even removed from
- * the dispatch list).
- */
- union {
- struct hlist_node hash; /* merge hash */
- struct llist_node ipi_list;
- };
-
- /*
- * The rb_node is only used inside the io scheduler, requests
- * are pruned when moved to the dispatch queue. So let the
- * completion_data share space with the rb_node.
- */
- union {
- struct rb_node rb_node; /* sort/lookup */
- struct bio_vec special_vec;
- void *completion_data;
- int error_count; /* for legacy drivers, don't use */
- };
-
- /*
- * Three pointers are available for the IO schedulers, if they need
- * more they have to dynamically allocate it. Flush requests are
- * never put on the IO scheduler. So let the flush fields share
- * space with the elevator data.
- */
- union {
- struct {
- struct io_cq *icq;
- void *priv[2];
- } elv;
-
- struct {
- unsigned int seq;
- struct list_head list;
- rq_end_io_fn *saved_end_io;
- } flush;
- };
-
- struct gendisk *rq_disk;
- struct block_device *part;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
- /* Time that the first bio started allocating this request. */
- u64 alloc_time_ns;
-#endif
- /* Time that this request was allocated for this IO. */
- u64 start_time_ns;
- /* Time that I/O was submitted to the device. */
- u64 io_start_time_ns;
-
-#ifdef CONFIG_BLK_WBT
- unsigned short wbt_flags;
-#endif
- /*
- * rq sectors used for blk stats. It has the same value
- * with blk_rq_sectors(rq), except that it never be zeroed
- * by completion.
- */
- unsigned short stats_sectors;
-
- /*
- * Number of scatter-gather DMA addr+len pairs after
- * physical address coalescing is performed.
- */
- unsigned short nr_phys_segments;
-
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
- unsigned short nr_integrity_segments;
-#endif
-
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- struct bio_crypt_ctx *crypt_ctx;
- struct blk_ksm_keyslot *crypt_keyslot;
-#endif
-
- unsigned short write_hint;
- unsigned short ioprio;
-
- enum mq_rq_state state;
- refcount_t ref;
-
- unsigned int timeout;
- unsigned long deadline;
-
- union {
- struct __call_single_data csd;
- u64 fifo_time;
- };
+static inline int blk_validate_block_size(unsigned int bsize)
+{
+ if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+ return -EINVAL;
- /*
- * completion callback.
- */
- rq_end_io_fn *end_io;
- void *end_io_data;
-};
+ return 0;
+}
static inline bool blk_op_is_passthrough(unsigned int op)
{
@@ -241,35 +58,6 @@ static inline bool blk_op_is_passthrough(unsigned int op)
return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
}
-static inline bool blk_rq_is_passthrough(struct request *rq)
-{
- return blk_op_is_passthrough(req_op(rq));
-}
-
-static inline unsigned short req_get_ioprio(struct request *req)
-{
- return req->ioprio;
-}
-
-#include <linux/elevator.h>
-
-struct blk_queue_ctx;
-
-struct bio_vec;
-
-enum blk_eh_timer_return {
- BLK_EH_DONE, /* drivers has completed the command */
- BLK_EH_RESET_TIMER, /* reset timer and try again */
-};
-
-enum blk_queue_state {
- Queue_down,
- Queue_up,
-};
-
-#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
-#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
-
/*
* Zoned block device models (zoned limit).
*
@@ -370,6 +158,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
#endif /* CONFIG_BLK_DEV_ZONED */
+/*
+ * Independent access ranges: struct blk_independent_access_range describes
+ * a range of contiguous sectors that can be accessed using device command
+ * execution resources that are independent from the resources used for
+ * other access ranges. This is typically found with single-LUN multi-actuator
+ * HDDs where each access range is served by a different set of heads.
+ * The set of independent ranges supported by the device is defined using
+ * struct blk_independent_access_ranges. The independent ranges must not overlap
+ * and must include all sectors within the disk capacity (no sector holes
+ * allowed).
+ * For a device with multiple ranges, requests targeting sectors in different
+ * ranges can be executed in parallel. A request can straddle an access range
+ * boundary.
+ */
+struct blk_independent_access_range {
+ struct kobject kobj;
+ struct request_queue *queue;
+ sector_t sector;
+ sector_t nr_sectors;
+};
+
+struct blk_independent_access_ranges {
+ struct kobject kobj;
+ bool sysfs_registered;
+ unsigned int nr_ia_ranges;
+ struct blk_independent_access_range ia_range[];
+};
+
struct request_queue {
struct request *last_merge;
struct elevator_queue *elevator;
@@ -444,8 +260,7 @@ struct request_queue {
unsigned int dma_alignment;
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- /* Inline crypto capabilities */
- struct blk_keyslot_manager *ksm;
+ struct blk_crypto_profile *crypto_profile;
#endif
unsigned int rq_timeout;
@@ -457,10 +272,9 @@ struct request_queue {
struct timer_list timeout;
struct work_struct timeout_work;
- atomic_t nr_active_requests_shared_sbitmap;
+ atomic_t nr_active_requests_shared_tags;
- struct sbitmap_queue sched_bitmap_tags;
- struct sbitmap_queue sched_breserved_tags;
+ struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
@@ -536,6 +350,8 @@ struct request_queue {
*/
struct mutex mq_freeze_lock;
+ int quiesce_depth;
+
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
struct bio_set bio_split;
@@ -549,10 +365,14 @@ struct request_queue {
bool mq_sysfs_init_done;
- size_t cmd_size;
-
#define BLK_MAX_WRITE_HINTS 5
u64 write_hints[BLK_MAX_WRITE_HINTS];
+
+ /*
+ * Independent sector access ranges. This is always NULL for
+ * devices that do not have multiple independent access ranges.
+ */
+ struct blk_independent_access_ranges *ia_ranges;
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
@@ -638,11 +458,6 @@ extern void blk_clear_pm_only(struct request_queue *q);
#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
-#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)
-
-#define rq_dma_dir(rq) \
- (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
-
#define dma_map_bvec(dev, bv, dir, attrs) \
dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
(dir), (attrs))
@@ -758,42 +573,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q)
}
#endif /* CONFIG_BLK_DEV_ZONED */
-static inline bool rq_is_sync(struct request *rq)
-{
- return op_is_sync(rq->cmd_flags);
-}
-
-static inline bool rq_mergeable(struct request *rq)
-{
- if (blk_rq_is_passthrough(rq))
- return false;
-
- if (req_op(rq) == REQ_OP_FLUSH)
- return false;
-
- if (req_op(rq) == REQ_OP_WRITE_ZEROES)
- return false;
-
- if (req_op(rq) == REQ_OP_ZONE_APPEND)
- return false;
-
- if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
- return false;
- if (rq->rq_flags & RQF_NOMERGE_FLAGS)
- return false;
-
- return true;
-}
-
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
- if (bio_page(a) == bio_page(b) &&
- bio_offset(a) == bio_offset(b))
- return true;
-
- return false;
-}
-
static inline unsigned int blk_queue_depth(struct request_queue *q)
{
if (q->queue_depth)
@@ -808,83 +587,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
#define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
#define BLK_MIN_SG_TIMEOUT (7 * HZ)
-struct rq_map_data {
- struct page **pages;
- int page_order;
- int nr_entries;
- unsigned long offset;
- int null_mapped;
- int from_user;
-};
-
-struct req_iterator {
- struct bvec_iter iter;
- struct bio *bio;
-};
-
/* This should not be used directly - use rq_for_each_segment */
#define for_each_bio(_bio) \
for (; _bio; _bio = _bio->bi_next)
-#define __rq_for_each_bio(_bio, rq) \
- if ((rq->bio)) \
- for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
-
-#define rq_for_each_segment(bvl, _rq, _iter) \
- __rq_for_each_bio(_iter.bio, _rq) \
- bio_for_each_segment(bvl, _iter.bio, _iter.iter)
-#define rq_for_each_bvec(bvl, _rq, _iter) \
- __rq_for_each_bio(_iter.bio, _rq) \
- bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
-
-#define rq_iter_last(bvec, _iter) \
- (_iter.bio->bi_next == NULL && \
- bio_iter_last(bvec, _iter.iter))
-
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
-#endif
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-extern void rq_flush_dcache_pages(struct request *rq);
-#else
-static inline void rq_flush_dcache_pages(struct request *rq)
-{
-}
-#endif
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
-blk_qc_t submit_bio_noacct(struct bio *bio);
-extern void blk_rq_init(struct request_queue *q, struct request *rq);
-extern void blk_put_request(struct request *);
-extern struct request *blk_get_request(struct request_queue *, unsigned int op,
- blk_mq_req_flags_t flags);
+void submit_bio_noacct(struct bio *bio);
+
extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
- struct bio_set *bs, gfp_t gfp_mask,
- int (*bio_ctr)(struct bio *, struct bio *, void *),
- void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
-extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
- struct request *rq);
-int blk_rq_append_bio(struct request *rq, struct bio *bio);
extern void blk_queue_split(struct bio **);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
-extern int blk_rq_map_user(struct request_queue *, struct request *,
- struct rq_map_data *, void __user *, unsigned long,
- gfp_t);
-extern int blk_rq_unmap_user(struct bio *);
-extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
-extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
- struct rq_map_data *, const struct iov_iter *,
- gfp_t);
-extern void blk_execute_rq_nowait(struct gendisk *,
- struct request *, int, rq_end_io_fn *);
-
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
- int at_head);
/* Helper to convert REQ_OP_XXX to its string format XXX */
extern const char *blk_op_str(unsigned int op);
@@ -892,11 +608,17 @@ extern const char *blk_op_str(unsigned int op);
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
+/* only poll the hardware once, don't continue until a completion was found */
+#define BLK_POLL_ONESHOT (1 << 0)
+/* do not sleep to wait for the expected completion time */
+#define BLK_POLL_NOSLEEP (1 << 1)
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+ unsigned int flags);
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
- return bdev->bd_disk->queue; /* this is never NULL */
+ return bdev->bd_queue; /* this is never NULL */
}
/*
@@ -916,47 +638,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK (PAGE_SECTORS - 1)
-/*
- * blk_rq_pos() : the current sector
- * blk_rq_bytes() : bytes left in the entire request
- * blk_rq_cur_bytes() : bytes left in the current segment
- * blk_rq_err_bytes() : bytes left till the next error boundary
- * blk_rq_sectors() : sectors left in the entire request
- * blk_rq_cur_sectors() : sectors left in the current segment
- * blk_rq_stats_sectors() : sectors of the entire request used for stats
- */
-static inline sector_t blk_rq_pos(const struct request *rq)
-{
- return rq->__sector;
-}
-
-static inline unsigned int blk_rq_bytes(const struct request *rq)
-{
- return rq->__data_len;
-}
-
-static inline int blk_rq_cur_bytes(const struct request *rq)
-{
- return rq->bio ? bio_cur_bytes(rq->bio) : 0;
-}
-
-extern unsigned int blk_rq_err_bytes(const struct request *rq);
-
-static inline unsigned int blk_rq_sectors(const struct request *rq)
-{
- return blk_rq_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
-{
- return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
-}
-
-static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
-{
- return rq->stats_sectors;
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
@@ -973,42 +654,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio)
return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
bio->bi_iter.bi_sector);
}
-
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
- return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
- return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
-}
#endif /* CONFIG_BLK_DEV_ZONED */
-/*
- * Some commands like WRITE SAME have a payload or data transfer size which
- * is different from the size of the request. Any driver that supports such
- * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
- * calculate the data transfer size.
- */
-static inline unsigned int blk_rq_payload_bytes(struct request *rq)
-{
- if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- return rq->special_vec.bv_len;
- return blk_rq_bytes(rq);
-}
-
-/*
- * Return the first full biovec in the request. The caller needs to check that
- * there are any bvecs before calling this helper.
- */
-static inline struct bio_vec req_bvec(struct request *rq)
-{
- if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- return rq->special_vec;
- return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
-}
-
static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
int op)
{
@@ -1048,47 +695,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
return min(q->limits.max_sectors, chunk_sectors);
}
-static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
- sector_t offset)
-{
- struct request_queue *q = rq->q;
-
- if (blk_rq_is_passthrough(rq))
- return q->limits.max_hw_sectors;
-
- if (!q->limits.chunk_sectors ||
- req_op(rq) == REQ_OP_DISCARD ||
- req_op(rq) == REQ_OP_SECURE_ERASE)
- return blk_queue_get_max_sectors(q, req_op(rq));
-
- return min(blk_max_size_offset(q, offset, 0),
- blk_queue_get_max_sectors(q, req_op(rq)));
-}
-
-static inline unsigned int blk_rq_count_bios(struct request *rq)
-{
- unsigned int nr_bios = 0;
- struct bio *bio;
-
- __rq_for_each_bio(bio, rq)
- nr_bios++;
-
- return nr_bios;
-}
-
-void blk_steal_bios(struct bio_list *list, struct request *rq);
-
-/*
- * Request completion related functions.
- *
- * blk_update_request() completes given number of bytes and updates
- * the request without completing it.
- */
-extern bool blk_update_request(struct request *rq, blk_status_t error,
- unsigned int nr_bytes);
-
-extern void blk_abort_request(struct request *);
-
/*
* Access functions for manipulating queue properties
*/
@@ -1133,46 +739,24 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-extern void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features);
-extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
- struct device *dev);
-/*
- * Number of physical segments as sent to the device.
- *
- * Normally this is the number of discontiguous data segments sent by the
- * submitter. But for data-less command like discard we might have no
- * actual data segments submitted, but the driver might have to add it's
- * own special payload. In that case we still return 1 here so that this
- * special payload will be mapped.
- */
-static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
-{
- if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- return 1;
- return rq->nr_phys_segments;
-}
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
+void disk_set_independent_access_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *iars);
/*
- * Number of discard segments (or ranges) the driver needs to fill in.
- * Each discard bio merged into a request is counted as one segment.
+ * Elevator features for blk_queue_required_elevator_features:
*/
-static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
-{
- return max_t(unsigned short, rq->nr_phys_segments, 1);
-}
+/* Supports zoned block devices sequential write constraint */
+#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE (1U << 1)
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist, struct scatterlist **last_sg);
-static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist)
-{
- struct scatterlist *last_sg = NULL;
-
- return __blk_rq_map_sg(q, rq, sglist, &last_sg);
-}
-extern void blk_dump_rq_flags(struct request *, char *);
+extern void blk_queue_required_elevator_features(struct request_queue *q,
+ unsigned int features);
+extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
+ struct device *dev);
bool __must_check blk_get_queue(struct request_queue *);
extern void blk_put_queue(struct request_queue *);
@@ -1187,19 +771,24 @@ extern void blk_set_queue_dying(struct request_queue *);
* as the lock contention for request_queue lock is reduced.
*
* It is ok not to disable preemption when adding the request to the plug list
- * or when attempting a merge, because blk_schedule_flush_list() will only flush
- * the plug list when the task sleeps by itself. For details, please see
- * schedule() where blk_schedule_flush_plug() is called.
+ * or when attempting a merge. For details, please see schedule() where
+ * blk_flush_plug() is called.
*/
struct blk_plug {
- struct list_head mq_list; /* blk-mq requests */
- struct list_head cb_list; /* md requires an unplug callback */
+ struct request *mq_list; /* blk-mq requests */
+
+ /* if ios_left is > 1, we can batch tag/rq allocations */
+ struct request *cached_rq;
+ unsigned short nr_ios;
+
unsigned short rq_count;
+
bool multiple_queues;
+ bool has_elevator;
bool nowait;
+
+ struct list_head cb_list; /* md requires an unplug callback */
};
-#define BLK_MAX_REQUEST_COUNT 16
-#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
struct blk_plug_cb;
typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
@@ -1211,32 +800,17 @@ struct blk_plug_cb {
extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
void *data, int size);
extern void blk_start_plug(struct blk_plug *);
+extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *, bool);
-static inline void blk_flush_plug(struct task_struct *tsk)
-{
- struct blk_plug *plug = tsk->plug;
-
- if (plug)
- blk_flush_plug_list(plug, false);
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *tsk)
-{
- struct blk_plug *plug = tsk->plug;
-
- if (plug)
- blk_flush_plug_list(plug, true);
-}
+void blk_flush_plug(struct blk_plug *plug, bool from_schedule);
static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
return plug &&
- (!list_empty(&plug->mq_list) ||
- !list_empty(&plug->cb_list));
+ (plug->mq_list || !list_empty(&plug->cb_list));
}
int blkdev_issue_flush(struct block_device *bdev);
@@ -1245,23 +819,23 @@ long nr_blockdev_pages(void);
struct blk_plug {
};
-static inline void blk_start_plug(struct blk_plug *plug)
+static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
+ unsigned short nr_ios)
{
}
-static inline void blk_finish_plug(struct blk_plug *plug)
+static inline void blk_start_plug(struct blk_plug *plug)
{
}
-static inline void blk_flush_plug(struct task_struct *task)
+static inline void blk_finish_plug(struct blk_plug *plug)
{
}
-static inline void blk_schedule_flush_plug(struct task_struct *task)
+static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
}
-
static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
return false;
@@ -1499,22 +1073,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
return offset << SECTOR_SHIFT;
}
-/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
- */
-static inline bool blk_discard_mergable(struct request *req)
-{
- if (req_op(req) == REQ_OP_DISCARD &&
- queue_max_discard_segments(req->q) > 1)
- return true;
- return false;
-}
-
static inline int bdev_discard_alignment(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1628,210 +1186,28 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
MODULE_ALIAS("block-major-" __stringify(major) "-*")
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-enum blk_integrity_flags {
- BLK_INTEGRITY_VERIFY = 1 << 0,
- BLK_INTEGRITY_GENERATE = 1 << 1,
- BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
- BLK_INTEGRITY_IP_CHECKSUM = 1 << 3,
-};
-
-struct blk_integrity_iter {
- void *prot_buf;
- void *data_buf;
- sector_t seed;
- unsigned int data_size;
- unsigned short interval;
- const char *disk_name;
-};
-
-typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
-typedef void (integrity_prepare_fn) (struct request *);
-typedef void (integrity_complete_fn) (struct request *, unsigned int);
-
-struct blk_integrity_profile {
- integrity_processing_fn *generate_fn;
- integrity_processing_fn *verify_fn;
- integrity_prepare_fn *prepare_fn;
- integrity_complete_fn *complete_fn;
- const char *name;
-};
-
-extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
-extern void blk_integrity_unregister(struct gendisk *);
-extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
- struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
- struct blk_integrity *bi = &disk->queue->integrity;
-
- if (!bi->profile)
- return NULL;
-
- return bi;
-}
-
-static inline
-struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
- return blk_get_integrity(bdev->bd_disk);
-}
-
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
- return q->integrity.profile;
-}
-
-static inline bool blk_integrity_rq(struct request *rq)
-{
- return rq->cmd_flags & REQ_INTEGRITY;
-}
-
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
- unsigned int segs)
-{
- q->limits.max_integrity_segments = segs;
-}
-
-static inline unsigned short
-queue_max_integrity_segments(const struct request_queue *q)
-{
- return q->limits.max_integrity_segments;
-}
-
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi: blk_integrity profile for device
- * @sectors: Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device. Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/*
- * Return the first bvec that contains integrity data. Only drivers that are
- * limited to a single integrity segment should use this helper.
- */
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
- if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
- return NULL;
- return rq->bio->bi_integrity->bip_vec;
-}
-
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-
-struct bio;
-struct block_device;
-struct gendisk;
-struct blk_integrity;
-
-static inline int blk_integrity_rq(struct request *rq)
-{
- return 0;
-}
-static inline int blk_rq_count_integrity_sg(struct request_queue *q,
- struct bio *b)
-{
- return 0;
-}
-static inline int blk_rq_map_integrity_sg(struct request_queue *q,
- struct bio *b,
- struct scatterlist *s)
-{
- return 0;
-}
-static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
-{
- return NULL;
-}
-static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
-{
- return NULL;
-}
-static inline bool
-blk_integrity_queue_supports_integrity(struct request_queue *q)
-{
- return false;
-}
-static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
-{
- return 0;
-}
-static inline void blk_integrity_register(struct gendisk *d,
- struct blk_integrity *b)
-{
-}
-static inline void blk_integrity_unregister(struct gendisk *d)
-{
-}
-static inline void blk_queue_max_integrity_segments(struct request_queue *q,
- unsigned int segs)
-{
-}
-static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
-{
- return 0;
-}
-
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return 0;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return 0;
-}
-
-static inline struct bio_vec *rq_integrity_vec(struct request *rq)
-{
- return NULL;
-}
-
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q);
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+ struct request_queue *q);
-void blk_ksm_unregister(struct request_queue *q);
+void blk_crypto_unregister(struct request_queue *q);
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm,
- struct request_queue *q)
+static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
+ struct request_queue *q)
{
return true;
}
-static inline void blk_ksm_unregister(struct request_queue *q) { }
+static inline void blk_crypto_unregister(struct request_queue *q) { }
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
struct block_device_operations {
- blk_qc_t (*submit_bio) (struct bio *bio);
+ void (*submit_bio)(struct bio *bio);
int (*open) (struct block_device *, fmode_t);
void (*release) (struct gendisk *, fmode_t);
int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
@@ -1869,60 +1245,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
-#ifdef CONFIG_BLK_DEV_ZONED
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
- if (blk_req_needs_zone_write_lock(rq))
- __blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
- if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
- __blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return rq->q->seq_zones_wlock &&
- test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- if (!blk_req_needs_zone_write_lock(rq))
- return true;
- return !blk_req_zone_is_write_locked(rq);
-}
-#else
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
static inline void blk_wake_io_task(struct task_struct *waiter)
{
/*
@@ -2005,4 +1327,41 @@ int fsync_bdev(struct block_device *bdev);
int freeze_bdev(struct block_device *bdev);
int thaw_bdev(struct block_device *bdev);
+struct io_comp_batch {
+ struct request *req_list;
+ bool need_ts;
+ void (*complete)(struct io_comp_batch *);
+};
+
+#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
+
+#define rq_list_add(listptr, rq) do { \
+ (rq)->rq_next = *(listptr); \
+ *(listptr) = rq; \
+} while (0)
+
+#define rq_list_pop(listptr) \
+({ \
+ struct request *__req = NULL; \
+ if ((listptr) && *(listptr)) { \
+ __req = *(listptr); \
+ *(listptr) = __req->rq_next; \
+ } \
+ __req; \
+})
+
+#define rq_list_peek(listptr) \
+({ \
+ struct request *__req = NULL; \
+ if ((listptr) && *(listptr)) \
+ __req = *(listptr); \
+ __req; \
+})
+
+#define rq_list_for_each(listptr, pos) \
+ for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \
+
+#define rq_list_next(rq) (rq)->rq_next
+#define rq_list_empty(list) ((list) == (struct request *) NULL)
+
#endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index a083e15df608..22501a293fa5 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -2,7 +2,7 @@
#ifndef BLKTRACE_H
#define BLKTRACE_H
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/relay.h>
#include <linux/compat.h>
#include <uapi/linux/blktrace_api.h>
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0e9bdd42dafb..35c25dff651a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -44,7 +44,7 @@ struct bvec_iter {
unsigned int bi_bvec_done; /* number of bytes completed in
current bvec */
-};
+} __packed;
struct bvec_iter_all {
struct bio_vec bv;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 114553b487ef..a7df155ea49b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
struct dm_table *t);
/*
- * Table keyslot manager functions
+ * Table blk_crypto_profile functions
*/
-void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
+void dm_destroy_crypto_profile(struct blk_crypto_profile *profile);
/*-----------------------------------------------------------------
* Macros.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd2..31029a91f440 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -48,6 +48,7 @@
struct backing_dev_info;
struct bdi_writeback;
struct bio;
+struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
@@ -334,11 +335,7 @@ struct kiocb {
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
- union {
- unsigned int ki_cookie; /* for ->iopoll */
- struct wait_page_queue *ki_waitq; /* for async buffered IO */
- };
-
+ struct wait_page_queue *ki_waitq; /* for async buffered IO */
randomized_struct_fields_end
};
@@ -2075,7 +2072,8 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
- int (*iopoll)(struct kiocb *kiocb, bool spin);
+ int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
+ unsigned int flags);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0f5315c2b5a3..13f313ab99e7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,12 +12,10 @@
#include <linux/types.h>
#include <linux/kdev_t.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-#include <linux/percpu-refcount.h>
#include <linux/uuid.h>
#include <linux/blk_types.h>
-#include <asm/local.h>
+#include <linux/device.h>
+#include <linux/xarray.h>
extern const struct device_type disk_type;
extern struct device_type part_type;
@@ -26,14 +24,6 @@ extern struct class block_class;
#define DISK_MAX_PARTS 256
#define DISK_NAME_LEN 32
-#include <linux/major.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/workqueue.h>
-#include <linux/xarray.h>
-
#define PARTITION_META_INFO_VOLNAMELTH 64
/*
* Enough for the string representation of any kind of UUID plus NULL.
@@ -223,6 +213,8 @@ static inline int add_disk(struct gendisk *disk)
}
extern void del_gendisk(struct gendisk *gp);
+void invalidate_disk(struct gendisk *disk);
+
void set_disk_ro(struct gendisk *disk, bool read_only);
static inline int get_disk_ro(struct gendisk *disk)
@@ -231,6 +223,11 @@ static inline int get_disk_ro(struct gendisk *disk)
test_bit(GD_READ_ONLY, &disk->state);
}
+static inline int bdev_read_only(struct block_device *bdev)
+{
+ return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+}
+
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
@@ -291,10 +288,6 @@ bool bdev_check_media_change(struct block_device *bdev);
int __invalidate_device(struct block_device *bdev, bool kill_dirty);
void set_capacity(struct gendisk *disk, sector_t size);
-/* for drivers/char/raw.c: */
-int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
-long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 24f8489583ca..63f4ea4dac9b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping,
struct iomap_dio_ops {
int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
unsigned flags);
- blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset);
+ void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset);
};
/*
@@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
#ifdef CONFIG_SWAP
struct file;
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
deleted file mode 100644
index a27605e2f826..000000000000
--- a/include/linux/keyslot-manager.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2019 Google LLC
- */
-
-#ifndef __LINUX_KEYSLOT_MANAGER_H
-#define __LINUX_KEYSLOT_MANAGER_H
-
-#include <linux/bio.h>
-#include <linux/blk-crypto.h>
-
-struct blk_keyslot_manager;
-
-/**
- * struct blk_ksm_ll_ops - functions to manage keyslots in hardware
- * @keyslot_program: Program the specified key into the specified slot in the
- * inline encryption hardware.
- * @keyslot_evict: Evict key from the specified keyslot in the hardware.
- * The key is provided so that e.g. dm layers can evict
- * keys from the devices that they map over.
- * Returns 0 on success, -errno otherwise.
- *
- * This structure should be provided by storage device drivers when they set up
- * a keyslot manager - this structure holds the function ptrs that the keyslot
- * manager will use to manipulate keyslots in the hardware.
- */
-struct blk_ksm_ll_ops {
- int (*keyslot_program)(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot);
- int (*keyslot_evict)(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot);
-};
-
-struct blk_keyslot_manager {
- /*
- * The struct blk_ksm_ll_ops that this keyslot manager will use
- * to perform operations like programming and evicting keys on the
- * device
- */
- struct blk_ksm_ll_ops ksm_ll_ops;
-
- /*
- * The maximum number of bytes supported for specifying the data unit
- * number.
- */
- unsigned int max_dun_bytes_supported;
-
- /*
- * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents
- * whether a crypto mode and data unit size are supported. The i'th
- * bit of crypto_mode_supported[crypto_mode] is set iff a data unit
- * size of (1 << i) is supported. We only support data unit sizes
- * that are powers of 2.
- */
- unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX];
-
- /* Device for runtime power management (NULL if none) */
- struct device *dev;
-
- /* Here onwards are *private* fields for internal keyslot manager use */
-
- unsigned int num_slots;
-
- /* Protects programming and evicting keys from the device */
- struct rw_semaphore lock;
-
- /* List of idle slots, with least recently used slot at front */
- wait_queue_head_t idle_slots_wait_queue;
- struct list_head idle_slots;
- spinlock_t idle_slots_lock;
-
- /*
- * Hash table which maps struct *blk_crypto_key to keyslots, so that we
- * can find a key's keyslot in O(1) time rather than O(num_slots).
- * Protected by 'lock'.
- */
- struct hlist_head *slot_hashtable;
- unsigned int log_slot_ht_size;
-
- /* Per-keyslot data */
- struct blk_ksm_keyslot *slots;
-};
-
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
-
-int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
- unsigned int num_slots);
-
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- struct blk_ksm_keyslot **slot_ptr);
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot);
-
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot);
-
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_config *cfg);
-
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key);
-
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
-
-void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
- const struct blk_keyslot_manager *child);
-
-void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
-
-bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
- struct blk_keyslot_manager *ksm_subset);
-
-void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
- struct blk_keyslot_manager *reference_ksm);
-
-#endif /* __LINUX_KEYSLOT_MANAGER_H */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0c0c9a0fdf57..52eae8c45b8d 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -15,7 +15,7 @@
#include <linux/mmc/card.h>
#include <linux/mmc/pm.h>
#include <linux/dma-direction.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
struct mmc_ios {
unsigned int clock; /* clock rate */
@@ -492,7 +492,7 @@ struct mmc_host {
/* Inline encryption support */
#ifdef CONFIG_MMC_CRYPTO
- struct blk_keyslot_manager ksm;
+ struct blk_crypto_profile crypto_profile;
#endif
/* Host Software Queue support */
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index d2558121d48c..6f7949b2fd8d 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -3,6 +3,7 @@
#define _LINUX_PART_STAT_H
#include <linux/genhd.h>
+#include <asm/local.h>
struct disk_stats {
u64 nsecs[NR_STAT_GROUPS];
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index ae16a9856305..b31d3f3312ce 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -267,6 +267,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
}
/**
+ * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
+ * caller is responsible for taking RCU.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
+{
+ unsigned long __percpu *percpu_count;
+ bool ret = false;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(__ref_is_percpu(ref, &percpu_count))) {
+ this_cpu_inc(*percpu_count);
+ ret = true;
+ } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
+ ret = atomic_long_inc_not_zero(&ref->data->count);
+ }
+ return ret;
+}
+
+/**
* percpu_ref_tryget_live - try to increment a live percpu refcount
* @ref: percpu_ref to try-get
*
@@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
*/
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
- unsigned long __percpu *percpu_count;
bool ret = false;
rcu_read_lock();
-
- if (__ref_is_percpu(ref, &percpu_count)) {
- this_cpu_inc(*percpu_count);
- ret = true;
- } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
- ret = atomic_long_inc_not_zero(&ref->data->count);
- }
-
+ ret = percpu_ref_tryget_live_rcu(ref);
rcu_read_unlock();
-
return ret;
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 2713e689ad66..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -427,6 +427,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
int __sbitmap_queue_get(struct sbitmap_queue *sbq);
/**
+ * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
+ * @sbq: Bitmap queue to allocate from.
+ * @nr_tags: number of tags requested
+ * @offset: offset to add to returned bits
+ *
+ * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
+ * a bit in the mask returned, and the caller must add @offset to the value to
+ * get the absolute tag value.
+ */
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+ unsigned int *offset);
+
+/**
* __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
* sbitmap_queue, limiting the depth used from each word, with preemption
* already disabled.
@@ -515,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu);
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+ int *tags, int nr_tags);
+
static inline int sbq_index_inc(int index)
{
return (index + 1) & (SBQ_WAIT_QUEUES - 1);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1a927ddec64..e0454e60fe8f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,10 +1160,8 @@ struct task_struct {
/* Stacked block device info: */
struct bio_list *bio_list;
-#ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug *plug;
-#endif
/* VM state: */
struct reclaim_state *reclaim_state;
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 96305a64a5a7..c635c2e014e3 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -3,7 +3,7 @@
#define _LINUX_T10_PI_H
#include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
/*
* A T10 PI-capable target device can be formatted with different
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3571f383dfb6..3bfd487d1dd2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,7 +11,6 @@
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
-#include <linux/blk-cgroup.h>
struct bio;
@@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
return flags;
}
-static inline struct cgroup_subsys_state *
-wbc_blkcg_css(struct writeback_control *wbc)
-{
#ifdef CONFIG_CGROUP_WRITEBACK
- if (wbc->wb)
- return wbc->wb->blkcg_css;
-#endif
- return blkcg_root_css;
-}
+#define wbc_blkcg_css(wbc) \
+ ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
+#else
+#define wbc_blkcg_css(wbc) (blkcg_root_css)
+#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index b97e142a7ca9..430b73bd02ac 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>
#include <linux/sbitmap.h>
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index cc5ab96a7471..a95daa4d4caa 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -114,7 +114,7 @@ TRACE_EVENT(block_rq_requeue,
*/
TRACE_EVENT(block_rq_complete,
- TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
+ TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
TP_ARGS(rq, error, nr_bytes),
@@ -122,7 +122,7 @@ TRACE_EVENT(block_rq_complete,
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
- __field( int, error )
+ __field( int , error )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
@@ -131,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_pos(rq);
__entry->nr_sector = nr_bytes >> 9;
- __entry->error = error;
+ __entry->error = blk_status_to_errno(error);
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
diff --git a/init/main.c b/init/main.c
index 3c4054a95545..4162d7f3179f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -83,7 +83,6 @@
#include <linux/ptrace.h>
#include <linux/pti.h>
#include <linux/blkdev.h>
-#include <linux/elevator.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
diff --git a/kernel/acct.c b/kernel/acct.c
index 23a7ab8e6cbc..3df53cf1dcd5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 91a43e57a32e..a53863dafb3d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,6 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
-#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..67679e3933f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,7 +76,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f21714ea3db8..59bea523c84b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -13,7 +13,7 @@
#include "sched.h"
#include <linux/nospec.h>
-
+#include <linux/blkdev.h>
#include <linux/kcov.h>
#include <linux/scs.h>
@@ -6343,7 +6343,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* make sure to submit it to avoid deadlocks.
*/
if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -8354,7 +8354,8 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- blk_schedule_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..66128dfc05fb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -37,7 +37,6 @@
#include <linux/binfmts.h>
#include <linux/bitops.h>
-#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fa91f398f28b..1183c88634aa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -816,7 +816,7 @@ blk_trace_request_get_cgid(struct request *rq)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt;
@@ -834,7 +834,8 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL, cgid);
+ rq->cmd_flags, what, blk_status_to_errno(error), 0,
+ NULL, cgid);
rcu_read_unlock();
}
@@ -863,7 +864,7 @@ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
- int error, unsigned int nr_bytes)
+ blk_status_t error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
blk_trace_request_get_cgid(rq));
diff --git a/lib/random32.c b/lib/random32.c
index 4d0e05e471d7..a57a0e18819d 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -39,6 +39,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/bitops.h>
+#include <linux/slab.h>
#include <asm/unaligned.h>
#include <trace/events/random.h>
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index b25db9be938a..2709ab825499 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -489,6 +489,57 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
+ unsigned int *offset)
+{
+ struct sbitmap *sb = &sbq->sb;
+ unsigned int hint, depth;
+ unsigned long index, nr;
+ int i;
+
+ if (unlikely(sb->round_robin))
+ return 0;
+
+ depth = READ_ONCE(sb->depth);
+ hint = update_alloc_hint_before_get(sb, depth);
+
+ index = SB_NR_TO_INDEX(sb, hint);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ struct sbitmap_word *map = &sb->map[index];
+ unsigned long get_mask;
+
+ sbitmap_deferred_clear(map);
+ if (map->word == (1UL << (map->depth - 1)) - 1)
+ continue;
+
+ nr = find_first_zero_bit(&map->word, map->depth);
+ if (nr + nr_tags <= map->depth) {
+ atomic_long_t *ptr = (atomic_long_t *) &map->word;
+ int map_tags = min_t(int, nr_tags, map->depth);
+ unsigned long val, ret;
+
+ get_mask = ((1UL << map_tags) - 1) << nr;
+ do {
+ val = READ_ONCE(map->word);
+ ret = atomic_long_cmpxchg(ptr, val, get_mask | val);
+ } while (ret != val);
+ get_mask = (get_mask & ~ret) >> nr;
+ if (get_mask) {
+ *offset = nr + (index << sb->shift);
+ update_alloc_hint_after_get(sb, depth, hint,
+ *offset + map_tags - 1);
+ return get_mask;
+ }
+ }
+ /* Jump to next index. */
+ if (++index >= sb->map_nr)
+ index = 0;
+ }
+
+ return 0;
+}
+
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int shallow_depth)
{
@@ -577,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+ if (likely(!sb->round_robin && tag < sb->depth))
+ data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag);
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+ int *tags, int nr_tags)
+{
+ struct sbitmap *sb = &sbq->sb;
+ unsigned long *addr = NULL;
+ unsigned long mask = 0;
+ int i;
+
+ smp_mb__before_atomic();
+ for (i = 0; i < nr_tags; i++) {
+ const int tag = tags[i] - offset;
+ unsigned long *this_addr;
+
+ /* since we're clearing a batch, skip the deferred map */
+ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+ if (!addr) {
+ addr = this_addr;
+ } else if (addr != this_addr) {
+ atomic_long_andnot(mask, (atomic_long_t *) addr);
+ mask = 0;
+ addr = this_addr;
+ }
+ mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+ }
+
+ if (mask)
+ atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+ smp_mb__after_atomic();
+ sbitmap_queue_wake_up(sbq);
+ sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+ tags[nr_tags - 1] - offset);
+}
+
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu)
{
@@ -601,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
*/
smp_mb__after_atomic();
sbitmap_queue_wake_up(sbq);
-
- if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
- *per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+ sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 4a9d4e27d0d9..c878d995af06 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -2,8 +2,9 @@
#include <linux/wait.h>
#include <linux/rbtree.h>
-#include <linux/backing-dev.h>
#include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -977,6 +978,22 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+ struct super_block *sb;
+
+ if (!inode)
+ return &noop_backing_dev_info;
+
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return I_BDEV(inode)->bd_disk->bdi;
+#endif
+ return sb->s_bdi;
+}
+EXPORT_SYMBOL(inode_to_bdi);
+
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
if (!bdi || !bdi->dev)
diff --git a/mm/filemap.c b/mm/filemap.c
index 3e9feb5cc570..5e206a429b57 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
diff --git a/mm/highmem.c b/mm/highmem.c
index 4212ad0e4a19..471d9779a7f4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -23,7 +23,6 @@
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
diff --git a/mm/mempool.c b/mm/mempool.c
index 0b8afbec3e35..b933d0fc21b8 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -17,7 +17,6 @@
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"
diff --git a/mm/nommu.c b/mm/nommu.c
index 02d2427b8f9e..41ef204e7482 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
diff --git a/mm/page_io.c b/mm/page_io.c
index d597bc6e6e45..9725c7e1eeea 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -358,8 +358,6 @@ int swap_readpage(struct page *page, bool synchronous)
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- blk_qc_t qc;
- struct gendisk *disk;
unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@@ -409,26 +407,24 @@ int swap_readpage(struct page *page, bool synchronous)
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
bio_add_page(bio, page, thp_size(page), 0);
-
- disk = bio->bi_bdev->bd_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
- bio->bi_opf |= REQ_HIPRI;
+ bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
count_vm_event(PSWPIN);
bio_get(bio);
- qc = submit_bio(bio);
+ submit_bio(bio);
while (synchronous) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio->bi_private))
break;
- if (!blk_poll(disk->queue, qc, true))
+ if (!bio_poll(bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
diff --git a/mm/readahead.c b/mm/readahead.c
index 41b75d76d36e..e71e719e36c9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -12,7 +12,6 @@
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index 1588f33d009a..17e344e26e73 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -59,7 +59,6 @@ static struct vfsmount *shm_mnt;
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
-#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3dcaeecc50f..41c9e92f1f00 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -18,7 +18,7 @@
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
-#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>