aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-13 19:15:14 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-13 19:15:14 -0800
commitd080827f850ba4df5b955d5ca8c8c0fc92fe18c0 (patch)
tree37262315200bbbe50bdd64ce3011951a92855159
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux (diff)
parentMerge branch 'for-4.5/block-dax' into for-4.5/libnvdimm (diff)
downloadlinux-dev-d080827f850ba4df5b955d5ca8c8c0fc92fe18c0.tar.xz
linux-dev-d080827f850ba4df5b955d5ca8c8c0fc92fe18c0.zip
Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm updates from Dan Williams: "The bulk of this has appeared in -next and independently received a build success notification from the kbuild robot. The 'for-4.5/block- dax' topic branch was rebased over the weekend to drop the "block device end-of-life" rework that Al would like to see re-implemented with a notifier, and to address bug reports against the badblocks integration. There is pending feedback against "libnvdimm: Add a poison list and export badblocks" received last week. Linda identified some localized fixups that we will handle incrementally. Summary: - Media error handling: The 'badblocks' implementation that originated in md-raid is up-levelled to a generic capability of a block device. This initial implementation is limited to being consulted in the pmem block-i/o path. Later, 'badblocks' will be consulted when creating dax mappings. - Raw block device dax: For virtualization and other cases that want large contiguous mappings of persistent memory, add the capability to dax-mmap a block device directly. - Increased /dev/mem restrictions: Add an option to treat all io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access while a driver is actively using an address range. This behavior is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be overridden by the existing "iomem=relaxed" kernel command line option. - Miscellaneous fixes include a 'pfn'-device huge page alignment fix, block device shutdown crash fix, and other small libnvdimm fixes" * tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits) block: kill disk_{check|set|clear|alloc}_badblocks libnvdimm, pmem: nvdimm_read_bytes() badblocks support pmem, dax: disable dax in the presence of bad blocks pmem: fail io-requests to known bad blocks libnvdimm: convert to statically allocated badblocks libnvdimm: don't fail init for full badblocks list block, badblocks: introduce devm_init_badblocks block: clarify badblocks lifetime badblocks: rename badblocks_free to badblocks_exit libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h libnvdimm: Add a poison list and export badblocks nfit_test: Enable DSMs for all test NFITs md: convert to use the generic badblocks code block: Add badblock management for gendisks badblocks: Add core badblock management code block: fix del_gendisk() vs blkdev_ioctl crash block: enable dax for raw block devices block: introduce bdev_file_inode() restrict /dev/mem to idle io memory ranges arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug ...
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/Kconfig.debug14
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/Kconfig.debug14
-rw-r--r--arch/frv/Kconfig1
-rw-r--r--arch/m32r/Kconfig1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/Kconfig.debug12
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/Kconfig.debug12
-rw-r--r--arch/tile/Kconfig4
-rw-r--r--arch/unicore32/Kconfig1
-rw-r--r--arch/unicore32/Kconfig.debug14
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Kconfig.debug17
-rw-r--r--block/Makefile2
-rw-r--r--block/badblocks.c585
-rw-r--r--block/genhd.c30
-rw-r--r--block/ioctl.c71
-rw-r--r--drivers/acpi/nfit.c203
-rw-r--r--drivers/md/md.c516
-rw-r--r--drivers/md/md.h40
-rw-r--r--drivers/nvdimm/core.c169
-rw-r--r--drivers/nvdimm/namespace_devs.c132
-rw-r--r--drivers/nvdimm/nd-core.h2
-rw-r--r--drivers/nvdimm/nd.h16
-rw-r--r--drivers/nvdimm/pfn_devs.c93
-rw-r--r--drivers/nvdimm/pmem.c90
-rw-r--r--drivers/nvdimm/region_devs.c66
-rw-r--r--fs/block_dev.c122
-rw-r--r--include/linux/badblocks.h65
-rw-r--r--include/linux/fs.h11
-rw-r--r--include/linux/genhd.h2
-rw-r--r--include/linux/libnvdimm.h1
-rw-r--r--include/uapi/linux/fs.h2
-rw-r--r--kernel/resource.c11
-rw-r--r--lib/Kconfig.debug39
-rw-r--r--tools/testing/nvdimm/Kbuild2
-rw-r--r--tools/testing/nvdimm/test/iomap.c93
-rw-r--r--tools/testing/nvdimm/test/nfit.c11
40 files changed, 1673 insertions, 796 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 426115f7cb63..84b1b21b08ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2,6 +2,7 @@ config ARM
bool
default y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 259c0ca9c99a..e356357d86bb 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -15,20 +15,6 @@ config ARM_PTDUMP
kernel.
If in doubt, say "N"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- depends on MMU
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to memory mapped peripherals.
-
- If in doubt, say Y.
-
# RMK wants arm kernels compiled with frame pointers or stack unwinding.
# If you know what you are doing and are willing to live without stack
# traces, you can get a slightly smaller kernel by setting this option to
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d6ebffdc6bb1..4d5b416e2e4b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -3,6 +3,7 @@ config ARM64
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 04fb73b973f1..e13c4bf84d9e 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -14,20 +14,6 @@ config ARM64_PTDUMP
kernel.
If in doubt, say "N"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- depends on MMU
- help
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to memory mapped peripherals.
-
- If in doubt, say Y.
-
config PID_IN_CONTEXTIDR
bool "Write the current PID to the CONTEXTIDR register"
help
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 34aa19352dc1..03bfd6bf03e7 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -10,6 +10,7 @@ config FRV
select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CPU_DEVICES
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_WANT_IPC_PARSE_VERSION
select OLD_SIGSUSPEND3
select OLD_SIGACTION
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 9e44bbd8051e..836ac5a963c8 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -13,6 +13,7 @@ config M32R
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_ATOMIC64
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index db49e0d796b1..85eabc49de61 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -159,6 +159,7 @@ config PPC
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select HAVE_ARCH_SECCOMP_FILTER
config GENERIC_CSUM
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 3a510f4a6b68..a0e44a9c456f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR
platform probing is done, all platforms selected must
share the same address.
-config STRICT_DEVMEM
- def_bool y
- prompt "Filter access to /dev/mem"
- help
- This option restricts access to /dev/mem. If this option is
- disabled, you allow userspace access to all memory, including
- kernel and userspace memory. Accidental memory access is likely
- to be disastrous.
- Memory access is required for experts who want to debug the kernel.
-
- If you are unsure, say Y.
-
config FAIL_IOMMU
bool "Fault-injection capability for IOMMU"
depends on FAULT_INJECTION
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6d0ae7d30724..24490344c30f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -66,6 +66,7 @@ config S390
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index c56878e1245f..26c5d5beb4be 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- def_bool y
- prompt "Filter access to /dev/mem"
- ---help---
- This option restricts access to /dev/mem. If this option is
- disabled, you allow userspace access to all memory, including
- kernel and userspace memory. Accidental memory access is likely
- to be disastrous.
- Memory access is required for experts who want to debug the kernel.
-
- If you are unsure, say Y.
-
config S390_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 8ec7a4599c08..be05b4ae02b6 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -19,6 +19,7 @@ config TILE
select VIRT_TO_BUS
select SYS_HYPERVISOR
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CLOCKEVENTS
select MODULES_USE_ELF_RELA
@@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT
config TRACE_IRQFLAGS_SUPPORT
def_bool y
-config STRICT_DEVMEM
- def_bool y
-
# SMP is required for Tilera Linux.
config SMP
def_bool y
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index c9faddc61100..5dc4c0a43ccd 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -1,5 +1,6 @@
config UNICORE32
def_bool y
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_MEMBLOCK
diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug
index 1a3626239843..f075bbe1d46f 100644
--- a/arch/unicore32/Kconfig.debug
+++ b/arch/unicore32/Kconfig.debug
@@ -2,20 +2,6 @@ menu "Kernel hacking"
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- depends on MMU
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to memory mapped peripherals.
-
- If in doubt, say Y.
-
config EARLY_PRINTK
def_bool DEBUG_OCD
help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ccfededfe470..5d2293417946 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 110253ce83af..9b18ed97a8a2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel. Note that with PAT support
- enabled, even in this case there are restrictions on /dev/mem
- use due to the cache aliasing requirements.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to PCI space and the BIOS code and data regions.
- This is sufficient for dosemu and X and all common users of
- /dev/mem.
-
- If in doubt, say Y.
-
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
diff --git a/block/Makefile b/block/Makefile
index 00ecc97629db..db5f622c9d67 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
+ badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
diff --git a/block/badblocks.c b/block/badblocks.c
new file mode 100644
index 000000000000..7be53cb1cc3c
--- /dev/null
+++ b/block/badblocks.c
@@ -0,0 +1,585 @@
+/*
+ * Bad block management
+ *
+ * - Heavily based on MD badblocks code from Neil Brown
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/badblocks.h>
+#include <linux/seqlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo;
+ u64 *p = bb->page;
+ int rv;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ lo = 0;
+ rv = 0;
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not.
+ */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all range that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: weather to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 0;
+ unsigned long flags;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irqsave(&bb->lock, flags);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range
+ */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi'
+ */
+ if (bb->count >= MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 1;
+ break;
+ } else {
+ int this_sectors = sectors;
+
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irqrestore(&bb->lock, flags);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ *
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better the think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MAX_BADBLOCKS) {
+ rv = -ENOSPC;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now low doesn't overlap, so.. */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_clear);
+
+/**
+ * ack_all_badblocks() - Acknowledge all bad blocks in a list.
+ * @bb: the badblocks structure that holds all badblock information
+ *
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates
+ */
+void ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0 && bb->unacked_exist) {
+ u64 *p = bb->page;
+ int i;
+
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(ack_all_badblocks);
+
+/**
+ * badblocks_show() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @unack: weather to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of returned data
+ */
+ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_show);
+
+/**
+ * badblocks_store() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @len: length of data received from sysfs
+ * @unack: weather to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of the buffer processed or -ve error.
+ */
+ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
+ int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (badblocks_set(bb, sector, length, !unack))
+ return -ENOSPC;
+ else
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_store);
+
+static int __badblocks_init(struct device *dev, struct badblocks *bb,
+ int enable)
+{
+ bb->dev = dev;
+ bb->count = 0;
+ if (enable)
+ bb->shift = 0;
+ else
+ bb->shift = -1;
+ if (dev)
+ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
+ else
+ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!bb->page) {
+ bb->shift = -1;
+ return -ENOMEM;
+ }
+ seqlock_init(&bb->lock);
+
+ return 0;
+}
+
+/**
+ * badblocks_init() - initialize the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ * @enable: weather to enable badblocks accounting
+ *
+ * Return:
+ * 0: success
+ * -ve errno: on error
+ */
+int badblocks_init(struct badblocks *bb, int enable)
+{
+ return __badblocks_init(NULL, bb, enable);
+}
+EXPORT_SYMBOL_GPL(badblocks_init);
+
+int devm_init_badblocks(struct device *dev, struct badblocks *bb)
+{
+ if (!bb)
+ return -EINVAL;
+ return __badblocks_init(dev, bb, 1);
+}
+EXPORT_SYMBOL_GPL(devm_init_badblocks);
+
+/**
+ * badblocks_exit() - free the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ */
+void badblocks_exit(struct badblocks *bb)
+{
+ if (!bb)
+ return;
+ if (bb->dev)
+ devm_kfree(bb->dev, bb->page);
+ else
+ kfree(bb->page);
+ bb->page = NULL;
+}
+EXPORT_SYMBOL_GPL(badblocks_exit);
diff --git a/block/genhd.c b/block/genhd.c
index e5cafa51567c..5aaeb2ad0fd3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
+#include <linux/badblocks.h>
#include "blk.h"
@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
- disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
+/* sysfs access to bad-blocks list. */
+static ssize_t disk_badblocks_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return sprintf(page, "\n");
+
+ return badblocks_show(disk->bb, page, 0);
+}
+
+static ssize_t disk_badblocks_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t len)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return -ENXIO;
+
+ return badblocks_store(disk->bb, page, len, 0);
+}
+
/**
* get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for
@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
+ disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
diff --git a/block/ioctl.c b/block/ioctl.c
index 0918aed2d847..2c84683aada5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
+#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
+#ifdef CONFIG_FS_DAX
+bool blkdev_dax_capable(struct block_device *bdev)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!disk->fops->direct_access)
+ return false;
+
+ /*
+ * If the partition is not aligned on a page boundary, we can't
+ * do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ return false;
+
+ /*
+ * If the device has known bad blocks, force all I/O through the
+ * driver / page cache.
+ *
+ * TODO: support finer grained dax error handling
+ */
+ if (disk->bb && disk->bb->count)
+ return false;
+
+ return true;
+}
+
+static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
+{
+ unsigned long arg;
+ int rc = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (get_user(arg, (int __user *)(argp)))
+ return -EFAULT;
+ arg = !!arg;
+ if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
+ return 0;
+
+ if (arg)
+ arg = S_DAX;
+
+ if (arg && !blkdev_dax_capable(bdev))
+ return -ENOTTY;
+
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ if (bdev->bd_map_count == 0)
+ inode_set_flags(bdev->bd_inode, arg, S_DAX);
+ else
+ rc = -EBUSY;
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ return rc;
+}
+#else
+static int blkdev_daxset(struct block_device *bdev, int arg)
+{
+ if (arg)
+ return -ENOTTY;
+ return 0;
+}
+#endif
+
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKDAXSET:
+ return blkdev_daxset(bdev, arg);
+ case BLKDAXGET:
+ return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
+ break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index aa45d4802707..ad6d8c6b777e 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ndctl.h>
+#include <linux/delay.h>
#include <linux/list.h>
#include <linux/acpi.h>
#include <linux/sort.h>
@@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
/* devm will free nfit_blk */
}
+static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
+ struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
+{
+ cmd->address = addr;
+ cmd->length = length;
+
+ return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
+ sizeof(*cmd));
+}
+
+static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
+ struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
+{
+ int rc;
+
+ cmd->address = addr;
+ cmd->length = length;
+ cmd->type = ND_ARS_PERSISTENT;
+
+ while (1) {
+ rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
+ sizeof(*cmd));
+ if (rc)
+ return rc;
+ switch (cmd->status) {
+ case 0:
+ return 0;
+ case 1:
+ /* ARS unsupported, but we should never get here */
+ return 0;
+ case 2:
+ return -EINVAL;
+ case 3:
+ /* ARS is in progress */
+ msleep(1000);
+ break;
+ default:
+ return -ENXIO;
+ }
+ }
+}
+
+static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
+ struct nd_cmd_ars_status *cmd)
+{
+ int rc;
+
+ while (1) {
+ rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
+ sizeof(*cmd));
+ if (rc || cmd->status & 0xffff)
+ return -ENXIO;
+
+ /* Check extended status (Upper two bytes) */
+ switch (cmd->status >> 16) {
+ case 0:
+ return 0;
+ case 1:
+ /* ARS is in progress */
+ msleep(1000);
+ break;
+ case 2:
+ /* No ARS performed for the current boot */
+ return 0;
+ default:
+ return -ENXIO;
+ }
+ }
+}
+
+static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
+ struct nd_cmd_ars_status *ars_status, u64 start)
+{
+ int rc;
+ u32 i;
+
+ /*
+ * The address field returned by ars_status should be either
+ * less than or equal to the address we last started ARS for.
+ * The (start, length) returned by ars_status should also have
+ * non-zero overlap with the range we started ARS for.
+ * If this is not the case, bail.
+ */
+ if (ars_status->address > start ||
+ (ars_status->address + ars_status->length < start))
+ return -ENXIO;
+
+ for (i = 0; i < ars_status->num_records; i++) {
+ rc = nvdimm_bus_add_poison(nvdimm_bus,
+ ars_status->records[i].err_address,
+ ars_status->records[i].length);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
+ struct nd_region_desc *ndr_desc)
+{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+ struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
+ struct nd_cmd_ars_status *ars_status = NULL;
+ struct nd_cmd_ars_start *ars_start = NULL;
+ struct nd_cmd_ars_cap *ars_cap = NULL;
+ u64 start, len, cur, remaining;
+ int rc;
+
+ ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
+ if (!ars_cap)
+ return -ENOMEM;
+
+ start = ndr_desc->res->start;
+ len = ndr_desc->res->end - ndr_desc->res->start + 1;
+
+ rc = ars_get_cap(nd_desc, ars_cap, start, len);
+ if (rc)
+ goto out;
+
+ /*
+ * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
+ * extended status is not set, skip this but continue initialization
+ */
+ if ((ars_cap->status & 0xffff) ||
+ !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
+ dev_warn(acpi_desc->dev,
+ "ARS unsupported (status: 0x%x), won't create an error list\n",
+ ars_cap->status);
+ goto out;
+ }
+
+ /*
+ * Check if a full-range ARS has been run. If so, use those results
+ * without having to start a new ARS.
+ */
+ ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
+ GFP_KERNEL);
+ if (!ars_status) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = ars_get_status(nd_desc, ars_status);
+ if (rc)
+ goto out;
+
+ if (ars_status->address <= start &&
+ (ars_status->address + ars_status->length >= start + len)) {
+ rc = ars_status_process_records(nvdimm_bus, ars_status, start);
+ goto out;
+ }
+
+ /*
+ * ARS_STATUS can overflow if the number of poison entries found is
+ * greater than the maximum buffer size (ars_cap->max_ars_out)
+ * To detect overflow, check if the length field of ars_status
+ * is less than the length we supplied. If so, process the
+ * error entries we got, adjust the start point, and start again
+ */
+ ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
+ if (!ars_start)
+ return -ENOMEM;
+
+ cur = start;
+ remaining = len;
+ do {
+ u64 done, end;
+
+ rc = ars_do_start(nd_desc, ars_start, cur, remaining);
+ if (rc)
+ goto out;
+
+ rc = ars_get_status(nd_desc, ars_status);
+ if (rc)
+ goto out;
+
+ rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
+ if (rc)
+ goto out;
+
+ end = min(cur + remaining,
+ ars_status->address + ars_status->length);
+ done = end - cur;
+ cur += done;
+ remaining -= done;
+ } while (remaining);
+
+ out:
+ kfree(ars_cap);
+ kfree(ars_start);
+ kfree(ars_status);
+ return rc;
+}
+
static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
struct acpi_nfit_memory_map *memdev,
@@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
nvdimm_bus = acpi_desc->nvdimm_bus;
if (nfit_spa_type(spa) == NFIT_SPA_PM) {
+ rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
+ if (rc) {
+ dev_err(acpi_desc->dev,
+ "error while performing ARS to find poison: %d\n",
+ rc);
+ return rc;
+ }
if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
return -ENOMEM;
} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 61aacab424cf..31b595479aa5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
#include <linux/kthread.h>
#include <linux/blkdev.h>
+#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
@@ -710,8 +711,7 @@ void md_rdev_clear(struct md_rdev *rdev)
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
- kfree(rdev->badblocks.page);
- rdev->badblocks.page = NULL;
+ badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
@@ -1361,8 +1361,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
return cpu_to_le32(csum);
}
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
@@ -1487,8 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (md_set_badblocks(&rdev->badblocks,
- sector, count, 1) == 0)
+ if (badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -2320,7 +2317,7 @@ repeat:
rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
rdev->badblocks.changed = 0;
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
clear_bit(Blocked, &rdev->flags);
@@ -2446,7 +2443,7 @@ repeat:
clear_bit(Blocked, &rdev->flags);
if (any_badblocks_changed)
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
@@ -3054,11 +3051,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack);
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
-
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ * are recorded as bad. The list is truncated to fit within
+ * the one-page limit of sysfs.
+ * Writing "sector length" to this file adds an acknowledged
+ * bad block list.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ * been acknowledged. Writing to this file adds bad blocks
+ * without acknowledging them. This is largely for testing.
+ */
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
return badblocks_show(&rdev->badblocks, page, 0);
@@ -3173,14 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev)
* This reserves the space even on arrays where it cannot
* be used - I wonder if that matters
*/
- rdev->badblocks.count = 0;
- rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
- rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- seqlock_init(&rdev->badblocks.lock);
- if (rdev->badblocks.page == NULL)
- return -ENOMEM;
-
- return 0;
+ return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
@@ -8489,254 +8485,9 @@ void md_finish_reshape(struct mddev *mddev)
}
EXPORT_SYMBOL(md_finish_reshape);
-/* Bad block management.
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
- *
- * Locking of the bad-block table uses a seqlock so md_is_badblock
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
- *
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
- * We return
- * 0 if there are no known bad blocks in the range
- * 1 if there are known bad block which are all acknowledged
- * -1 if there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
- */
-int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
-{
- int hi;
- int lo;
- u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
-
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
- sectors = target - s;
- }
- /* 'target' is now the first block after the bad range */
-
-retry:
- seq = read_seqbegin(&bb->lock);
- lo = 0;
- rv = 0;
- hi = bb->count;
-
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not. */
- lo = mid;
- else
- /* This and later ranges are definitely out. */
- hi = mid;
- }
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
- */
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
- }
- }
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return rv;
-}
-EXPORT_SYMBOL_GPL(md_is_badblock);
-
-/*
- * Add a range of bad blocks to the table.
- * This might extend the table, or might contract it
- * if two adjacent ranges can be merged.
- * We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- */
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
-{
- u64 *p;
- int lo, hi;
- int rv = 1;
- unsigned long flags;
-
- if (bb->shift < 0)
- /* badblocks are disabled */
- return 0;
-
- if (bb->shift) {
- /* round the start down, and the end up */
- sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
- sectors = next - s;
- }
-
- write_seqlock_irqsave(&bb->lock, flags);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
-
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- }
- }
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- lo = hi;
- hi++;
- }
- }
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
- }
- }
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi' */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- /* No room for more */
- rv = 0;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
-
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
- }
-
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- write_sequnlock_irqrestore(&bb->lock, flags);
-
- return rv;
-}
+/* Bad block management */
+/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8745,114 +8496,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = md_set_badblocks(&rdev->badblocks,
- s, sectors, 0);
- if (rv) {
+ rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
+ if (rv == 0) {
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
- }
- return rv;
+ return 1;
+ } else
+ return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-/*
- * Remove a range of bad blocks from the table.
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- */
-static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
-{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
- int rv = 0;
-
- if (bb->shift > 0) {
- /* When clearing we round the start up and the end down.
- * This should not matter as the shift should align with
- * the block size and no rounding should ever be needed.
- * However it is better the think a block is bad when it
- * isn't than to think a block is not bad when it is.
- */
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
- sectors = target - s;
- }
-
- write_seqlock_irq(&bb->lock);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
- }
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
- */
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
- }
- }
-
- bb->changed = 1;
-out:
- write_sequnlock_irq(&bb->lock);
- return rv;
-}
-
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8860,133 +8516,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- return md_clear_badblocks(&rdev->badblocks,
+ return badblocks_clear(&rdev->badblocks,
s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
-/*
- * Acknowledge all bad blocks in a list.
- * This only succeeds if ->changed is clear. It is used by
- * in-kernel metadata updates
- */
-void md_ack_all_badblocks(struct badblocks *bb)
-{
- if (bb->page == NULL || bb->changed)
- /* no point even trying */
- return;
- write_seqlock_irq(&bb->lock);
-
- if (bb->changed == 0 && bb->unacked_exist) {
- u64 *p = bb->page;
- int i;
- for (i = 0; i < bb->count ; i++) {
- if (!BB_ACK(p[i])) {
- sector_t start = BB_OFFSET(p[i]);
- int len = BB_LEN(p[i]);
- p[i] = BB_MAKE(start, len, 1);
- }
- }
- bb->unacked_exist = 0;
- }
- write_sequnlock_irq(&bb->lock);
-}
-EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
-
-/* sysfs access to bad-blocks list.
- * We present two files.
- * 'bad-blocks' lists sector numbers and lengths of ranges that
- * are recorded as bad. The list is truncated to fit within
- * the one-page limit of sysfs.
- * Writing "sector length" to this file adds an acknowledged
- * bad block list.
- * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
- * been acknowledged. Writing to this file adds bad blocks
- * without acknowledging them. This is largely for testing.
- */
-
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack)
-{
- size_t len;
- int i;
- u64 *p = bb->page;
- unsigned seq;
-
- if (bb->shift < 0)
- return 0;
-
-retry:
- seq = read_seqbegin(&bb->lock);
-
- len = 0;
- i = 0;
-
- while (len < PAGE_SIZE && i < bb->count) {
- sector_t s = BB_OFFSET(p[i]);
- unsigned int length = BB_LEN(p[i]);
- int ack = BB_ACK(p[i]);
- i++;
-
- if (unack && ack)
- continue;
-
- len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
- (unsigned long long)s << bb->shift,
- length << bb->shift);
- }
- if (unack && len == 0)
- bb->unacked_exist = 0;
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return len;
-}
-
-#define DO_DEBUG 1
-
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
-{
- unsigned long long sector;
- int length;
- char newline;
-#ifdef DO_DEBUG
- /* Allow clearing via sysfs *only* for testing/debugging.
- * Normally only a successful write may clear a badblock
- */
- int clear = 0;
- if (page[0] == '-') {
- clear = 1;
- page++;
- }
-#endif /* DO_DEBUG */
-
- switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
- case 3:
- if (newline != '\n')
- return -EINVAL;
- case 2:
- if (length <= 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
-#ifdef DO_DEBUG
- if (clear) {
- md_clear_badblocks(bb, sector, length);
- return len;
- }
-#endif /* DO_DEBUG */
- if (md_set_badblocks(bb, sector, length, !unack))
- return len;
- else
- return -ENOSPC;
-}
-
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ca0b643fe3c1..75b9aaacb03f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -17,6 +17,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
@@ -28,13 +29,6 @@
#define MaxSector (~(sector_t)0)
-/* Bad block numbers are stored sorted in a single page.
- * 64bits is used for each block or extent.
- * 54 bits are sector number, 9 bits are extent size,
- * 1 bit is an 'acknowledged' flag.
- */
-#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
-
/*
* MD's 'extended' device
*/
@@ -117,22 +111,7 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
- struct badblocks {
- int count; /* count of bad blocks */
- int unacked_exist; /* there probably are unacknowledged
- * bad blocks. This is only cleared
- * when a read discovers none
- */
- int shift; /* shift from sectors to block size
- * a -ve shift means badblocks are
- * disabled.*/
- u64 *page; /* badblock list */
- int changed;
- seqlock_t lock;
-
- sector_t sector;
- sector_t size; /* in sectors */
- } badblocks;
+ struct badblocks badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -185,22 +164,11 @@ enum flag_bits {
*/
};
-#define BB_LEN_MASK (0x00000000000001FFULL)
-#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
-#define BB_ACK_MASK (0x8000000000000000ULL)
-#define BB_MAX_LEN 512
-#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
-#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
-#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
-#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
-
-extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors);
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
- int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+ int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
sectors,
first_bad, bad_sectors);
if (rv)
@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
-extern void md_ack_all_badblocks(struct badblocks *bb);
-
struct md_cluster_info;
struct mddev {
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 82c49bb87055..2e2832b83c93 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -11,6 +11,7 @@
* General Public License for more details.
*/
#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/blkdev.h>
@@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
if (!nvdimm_bus)
return NULL;
INIT_LIST_HEAD(&nvdimm_bus->list);
+ INIT_LIST_HEAD(&nvdimm_bus->poison_list);
init_waitqueue_head(&nvdimm_bus->probe_wait);
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
mutex_init(&nvdimm_bus->reconfig_mutex);
@@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
}
EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+ dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
+ (u64) s * 512, (u64) num * 512);
+ /* this isn't an error as the hardware will still throw an exception */
+ if (badblocks_set(bb, s, num, 1))
+ dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+ __func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb: badblocks instance to populate
+ * @ns_offset: namespace offset where the error range begins (in bytes)
+ * @len: number of bytes of poison to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+ const unsigned int sector_size = 512;
+ sector_t start_sector;
+ u64 num_sectors;
+ u32 rem;
+
+ start_sector = div_u64(ns_offset, sector_size);
+ num_sectors = div_u64_rem(len, sector_size, &rem);
+ if (rem)
+ num_sectors++;
+
+ if (unlikely(num_sectors > (u64)INT_MAX)) {
+ u64 remaining = num_sectors;
+ sector_t s = start_sector;
+
+ while (remaining) {
+ int done = min_t(u64, remaining, INT_MAX);
+
+ set_badblock(bb, s, done);
+ remaining -= done;
+ s += done;
+ }
+ } else
+ set_badblock(bb, start_sector, num_sectors);
+}
+
+/**
+ * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
+ * @ndns: the namespace containing poison ranges
+ * @bb: badblocks instance to populate
+ * @offset: offset at the start of the namespace before 'sector 0'
+ *
+ * The poison list generated during NFIT initialization may contain multiple,
+ * possibly overlapping ranges in the SPA (System Physical Address) space.
+ * Compare each of these ranges to the namespace currently being initialized,
+ * and add badblocks to the gendisk for all matching sub-ranges
+ */
+void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
+ struct badblocks *bb, resource_size_t offset)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+ struct nvdimm_bus *nvdimm_bus;
+ struct list_head *poison_list;
+ u64 ns_start, ns_end, ns_size;
+ struct nd_poison *pl;
+
+ ns_size = nvdimm_namespace_capacity(ndns) - offset;
+ ns_start = nsio->res.start + offset;
+ ns_end = nsio->res.end;
+
+ nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
+ poison_list = &nvdimm_bus->poison_list;
+ if (list_empty(poison_list))
+ return;
+
+ list_for_each_entry(pl, poison_list, list) {
+ u64 pl_end = pl->start + pl->length - 1;
+
+ /* Discard intervals with no intersection */
+ if (pl_end < ns_start)
+ continue;
+ if (pl->start > ns_end)
+ continue;
+ /* Deal with any overlap after start of the namespace */
+ if (pl->start >= ns_start) {
+ u64 start = pl->start;
+ u64 len;
+
+ if (pl_end <= ns_end)
+ len = pl->length;
+ else
+ len = ns_start + ns_size - pl->start;
+ __add_badblock_range(bb, start - ns_start, len);
+ continue;
+ }
+ /* Deal with overlap for poison starting before the namespace */
+ if (pl->start < ns_start) {
+ u64 len;
+
+ if (pl_end < ns_end)
+ len = pl->start + pl->length - ns_start;
+ else
+ len = ns_size;
+ __add_badblock_range(bb, 0, len);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
+
+static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+ struct nd_poison *pl;
+
+ pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+
+ pl->start = addr;
+ pl->length = length;
+ list_add_tail(&pl->list, &nvdimm_bus->poison_list);
+
+ return 0;
+}
+
+int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+ struct nd_poison *pl;
+
+ if (list_empty(&nvdimm_bus->poison_list))
+ return __add_poison(nvdimm_bus, addr, length);
+
+ /*
+ * There is a chance this is a duplicate, check for those first.
+ * This will be the common case as ARS_STATUS returns all known
+ * errors in the SPA space, and we can't query it per region
+ */
+ list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
+ if (pl->start == addr) {
+ /* If length has changed, update this list entry */
+ if (pl->length != length)
+ pl->length = length;
+ return 0;
+ }
+
+ /*
+ * If not a duplicate or a simple length update, add the entry as is,
+ * as any overlapping ranges will get resolved when the list is consumed
+ * and converted to badblocks
+ */
+ return __add_poison(nvdimm_bus, addr, length);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
+
+static void free_poison_list(struct list_head *poison_list)
+{
+ struct nd_poison *pl, *next;
+
+ list_for_each_entry_safe(pl, next, poison_list, list) {
+ list_del(&pl->list);
+ kfree(pl);
+ }
+ list_del_init(poison_list);
+}
+
static int child_unregister(struct device *dev, void *data)
{
/*
@@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
nd_synchronize();
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+ free_poison_list(&nvdimm_bus->poison_list);
nvdimm_bus_destroy_ndctl(nvdimm_bus);
device_unregister(&nvdimm_bus->dev);
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 0955b2cb10fe..8ebfcaae3f5a 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev)
return dev ? dev->type == &namespace_io_device_type : false;
}
+static int is_uuid_busy(struct device *dev, void *data)
+{
+ u8 *uuid1 = data, *uuid2 = NULL;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid2 = nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid2 = nsblk->uuid;
+ } else if (is_nd_btt(dev)) {
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ uuid2 = nd_btt->uuid;
+ } else if (is_nd_pfn(dev)) {
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+ uuid2 = nd_pfn->uuid;
+ }
+
+ if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0)
+ return -EBUSY;
+
+ return 0;
+}
+
+static int is_namespace_uuid_busy(struct device *dev, void *data)
+{
+ if (is_nd_pmem(dev) || is_nd_blk(dev))
+ return device_for_each_child(dev, data, is_uuid_busy);
+ return 0;
+}
+
+/**
+ * nd_is_uuid_unique - verify that no other namespace has @uuid
+ * @dev: any device on a nvdimm_bus
+ * @uuid: uuid to check
+ */
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return false;
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
+ if (device_for_each_child(&nvdimm_bus->dev, uuid,
+ is_namespace_uuid_busy) != 0)
+ return false;
+ return true;
+}
+
bool pmem_should_map_pages(struct device *dev)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
@@ -104,20 +157,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
const char *suffix = NULL;
- if (ndns->claim) {
- if (is_nd_btt(ndns->claim))
- suffix = "s";
- else if (is_nd_pfn(ndns->claim))
- suffix = "m";
- else
- dev_WARN_ONCE(&ndns->dev, 1,
- "unknown claim type by %s\n",
- dev_name(ndns->claim));
- }
+ if (ndns->claim && is_nd_btt(ndns->claim))
+ suffix = "s";
if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
- if (!suffix && pmem_should_map_pages(&ndns->dev))
- suffix = "m";
sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
} else if (is_namespace_blk(&ndns->dev)) {
struct nd_namespace_blk *nsblk;
@@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
res->end = nd_region->ndr_start + size - 1;
}
+static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where)
+{
+ if (!uuid) {
+ dev_dbg(dev, "%s: uuid not set\n", where);
+ return true;
+ }
+ return false;
+}
+
static ssize_t __size_store(struct device *dev, unsigned long long val)
{
resource_size_t allocated = 0, available = 0;
@@ -820,8 +872,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
* We need a uuid for the allocation-label and dimm(s) on which
* to store the label.
*/
- if (!uuid || nd_region->ndr_mappings == 0)
+ if (uuid_not_set(uuid, dev, __func__))
return -ENXIO;
+ if (nd_region->ndr_mappings == 0) {
+ dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__);
+ return -ENXIO;
+ }
div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
if (remainder) {
@@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev,
}
static DEVICE_ATTR_RO(holder);
+static ssize_t mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ struct device *claim;
+ char *mode;
+ ssize_t rc;
+
+ device_lock(dev);
+ claim = ndns->claim;
+ if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
+ mode = "memory";
+ else if (claim && is_nd_btt(claim))
+ mode = "safe";
+ else
+ mode = "raw";
+ rc = sprintf(buf, "%s\n", mode);
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(mode);
+
static ssize_t force_raw_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw);
static struct attribute *nd_namespace_attributes[] = {
&dev_attr_nstype.attr,
&dev_attr_size.attr,
+ &dev_attr_mode.attr,
&dev_attr_uuid.attr,
&dev_attr_holder.attr,
&dev_attr_resource.attr,
@@ -1267,7 +1347,8 @@ static umode_t namespace_visible(struct kobject *kobj,
if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
|| a == &dev_attr_holder.attr
- || a == &dev_attr_force_raw.attr)
+ || a == &dev_attr_force_raw.attr
+ || a == &dev_attr_mode.attr)
return a->mode;
return 0;
@@ -1343,14 +1424,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
struct nd_namespace_pmem *nspm;
nspm = to_nd_namespace_pmem(&ndns->dev);
- if (!nspm->uuid) {
- dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
+ if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
return ERR_PTR(-ENODEV);
- }
} else if (is_namespace_blk(&ndns->dev)) {
struct nd_namespace_blk *nsblk;
nsblk = to_nd_namespace_blk(&ndns->dev);
+ if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
+ return ERR_PTR(-ENODEV);
+ if (!nsblk->lbasize) {
+ dev_dbg(&ndns->dev, "%s: sector size not set\n",
+ __func__);
+ return ERR_PTR(-ENODEV);
+ }
if (!nd_namespace_blk_validate(nsblk))
return ERR_PTR(-ENODEV);
}
@@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
nd_device_register(nd_region->ns_seed);
}
+void nd_region_create_pfn_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->pfn_seed = nd_pfn_create(nd_region);
+ /*
+ * Seed creation failures are not fatal, provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->pfn_seed)
+ dev_err(&nd_region->dev, "failed to create pfn namespace\n");
+}
+
void nd_region_create_btt_seed(struct nd_region *nd_region)
{
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 159aed532042..1d1500f3d8b5 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -30,6 +30,7 @@ struct nvdimm_bus {
struct list_head list;
struct device dev;
int id, probe_active;
+ struct list_head poison_list;
struct mutex reconfig_mutex;
};
@@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
struct nd_region;
void nd_region_create_blk_seed(struct nd_region *nd_region);
void nd_region_create_btt_seed(struct nd_region *nd_region);
+void nd_region_create_pfn_seed(struct nd_region *nd_region);
void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 417e521d299c..ba1633b9da31 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -29,13 +29,12 @@ enum {
ND_MAX_LANES = 256,
SECTOR_SHIFT = 9,
INT_LBASIZE_ALIGNMENT = 64,
-#if IS_ENABLED(CONFIG_NVDIMM_PFN)
- ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE,
- ND_PFN_MASK = ND_PFN_ALIGN - 1,
-#else
- ND_PFN_ALIGN = 0,
- ND_PFN_MASK = 0,
-#endif
+};
+
+struct nd_poison {
+ u64 start;
+ u64 length;
+ struct list_head list;
};
struct nvdimm_drvdata {
@@ -153,6 +152,7 @@ struct nd_pfn {
int id;
u8 *uuid;
struct device dev;
+ unsigned long align;
unsigned long npfns;
enum nd_pfn_mode mode;
struct nd_pfn_sb *pfn_sb;
@@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
char *name);
+void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
+ struct badblocks *bb, resource_size_t offset);
int nd_blk_region_init(struct nd_region *nd_region);
void __nd_iostat_start(struct bio *bio, unsigned long *start);
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 71805a1aa0f3..f9b674bc49db 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev,
}
static DEVICE_ATTR_RW(mode);
+static ssize_t align_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+ return sprintf(buf, "%lx\n", nd_pfn->align);
+}
+
+static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
+{
+ unsigned long val;
+ int rc;
+
+ rc = kstrtoul(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
+ return -EINVAL;
+
+ if (nd_pfn->dev.driver)
+ return -EBUSY;
+ else
+ nd_pfn->align = val;
+
+ return 0;
+}
+
+static ssize_t align_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = __align_store(nd_pfn, buf);
+ dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+ rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(align);
+
static ssize_t uuid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = {
&dev_attr_mode.attr,
&dev_attr_namespace.attr,
&dev_attr_uuid.attr,
+ &dev_attr_align.attr,
NULL,
};
@@ -179,7 +226,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = {
};
static struct device *__nd_pfn_create(struct nd_region *nd_region,
- u8 *uuid, enum nd_pfn_mode mode,
struct nd_namespace_common *ndns)
{
struct nd_pfn *nd_pfn;
@@ -199,10 +245,8 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
return NULL;
}
- nd_pfn->mode = mode;
- if (uuid)
- uuid = kmemdup(uuid, 16, GFP_KERNEL);
- nd_pfn->uuid = uuid;
+ nd_pfn->mode = PFN_MODE_NONE;
+ nd_pfn->align = HPAGE_SIZE;
dev = &nd_pfn->dev;
dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
dev->parent = &nd_region->dev;
@@ -220,8 +264,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
struct device *nd_pfn_create(struct nd_region *nd_region)
{
- struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE,
- NULL);
+ struct device *dev = __nd_pfn_create(nd_region, NULL);
if (dev)
__nd_device_register(dev);
@@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
int nd_pfn_validate(struct nd_pfn *nd_pfn)
{
- struct nd_namespace_common *ndns = nd_pfn->ndns;
- struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
- struct nd_namespace_io *nsio;
u64 checksum, offset;
+ struct nd_namespace_io *nsio;
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
if (!pfn_sb || !ndns)
return -ENODEV;
@@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
if (!is_nd_pmem(nd_pfn->dev.parent))
return -ENODEV;
- /* section alignment for simple hotplug */
- if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN)
- return -ENODEV;
-
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
return -ENXIO;
@@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -ENODEV;
pfn_sb->checksum = cpu_to_le64(checksum);
+ if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
+ return -ENODEV;
+
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
break;
@@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -EINVAL;
}
+ if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
+ dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
+ nd_pfn->align, nvdimm_namespace_capacity(ndns));
+ return -EINVAL;
+ }
+
/*
* These warnings are verbose because they can only trigger in
* the case where the physical address alignment of the
@@ -286,17 +335,19 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
*/
offset = le64_to_cpu(pfn_sb->dataoff);
nsio = to_nd_namespace_io(&ndns->dev);
- if (nsio->res.start & ND_PFN_MASK) {
- dev_err(&nd_pfn->dev,
- "init failed: %s not section aligned\n",
- dev_name(&ndns->dev));
- return -EBUSY;
- } else if (offset >= resource_size(&nsio->res)) {
+ if (offset >= resource_size(&nsio->res)) {
dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
dev_name(&ndns->dev));
return -EBUSY;
}
+ nd_pfn->align = 1UL << ilog2(offset);
+ if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
+ dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
+ offset);
+ return -ENXIO;
+ }
+
return 0;
}
EXPORT_SYMBOL(nd_pfn_validate);
@@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
return -ENODEV;
nvdimm_bus_lock(&ndns->dev);
- dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns);
+ dev = __nd_pfn_create(nd_region, ndns);
nvdimm_bus_unlock(&ndns->dev);
if (!dev)
return -ENOMEM;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8ee79893d2f5..b493ff3fccb2 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/moduleparam.h>
+#include <linux/badblocks.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/pmem.h>
@@ -41,11 +42,25 @@ struct pmem_device {
phys_addr_t data_offset;
void __pmem *virt_addr;
size_t size;
+ struct badblocks bb;
};
static int pmem_major;
-static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
+{
+ if (bb->count) {
+ sector_t first_bad;
+ int num_bad;
+
+ return !!badblocks_check(bb, sector, len / 512, &first_bad,
+ &num_bad);
+ }
+
+ return false;
+}
+
+static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
unsigned int len, unsigned int off, int rw,
sector_t sector)
{
@@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
if (rw == READ) {
+ if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+ return -EIO;
memcpy_from_pmem(mem + off, pmem_addr, len);
flush_dcache_page(page);
} else {
@@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
}
kunmap_atomic(mem);
+ return 0;
}
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
+ int rc = 0;
bool do_acct;
unsigned long start;
struct bio_vec bvec;
@@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct pmem_device *pmem = bdev->bd_disk->private_data;
do_acct = nd_iostat_start(bio, &start);
- bio_for_each_segment(bvec, bio, iter)
- pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
- bio_data_dir(bio), iter.bi_sector);
+ bio_for_each_segment(bvec, bio, iter) {
+ rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset, bio_data_dir(bio),
+ iter.bi_sector);
+ if (rc) {
+ bio->bi_error = rc;
+ break;
+ }
+ }
if (do_acct)
nd_iostat_end(bio, start);
@@ -91,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, int rw)
{
struct pmem_device *pmem = bdev->bd_disk->private_data;
+ int rc;
- pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
if (rw & WRITE)
wmb_pmem();
- page_endio(page, rw & WRITE, 0);
- return 0;
+ /*
+ * The ->rw_page interface is subtle and tricky. The core
+ * retries on any error, so we can only invoke page_endio() in
+ * the successful completion case. Otherwise, we'll see crashes
+ * caused by double completion.
+ */
+ if (rc == 0)
+ page_endio(page, rw & WRITE, 0);
+
+ return rc;
}
static long pmem_direct_access(struct block_device *bdev, sector_t sector,
@@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev,
disk->driverfs_dev = dev;
set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
pmem->pmem_disk = disk;
+ devm_exit_badblocks(dev, &pmem->bb);
+ if (devm_init_badblocks(dev, &pmem->bb))
+ return -ENOMEM;
+ nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
+ disk->bb = &pmem->bb;
add_disk(disk);
revalidate_disk(disk);
@@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
return -EFAULT;
}
- if (rw == READ)
+ if (rw == READ) {
+ unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
+
+ if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
+ return -EIO;
memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
- else {
+ } else {
memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
wmb_pmem();
}
@@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
nd_pfn->pfn_sb = pfn_sb;
rc = nd_pfn_validate(nd_pfn);
- if (rc == 0 || rc == -EBUSY)
+ if (rc == -ENODEV)
+ /* no info block, do init */;
+ else
return rc;
- /* section alignment for simple hotplug */
- if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN
- || pmem->phys_addr & ND_PFN_MASK)
- return -ENODEV;
-
nd_region = to_nd_region(nd_pfn->dev.parent);
if (nd_region->ro) {
dev_info(&nd_pfn->dev,
@@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* ->direct_access() to those that are included in the memmap.
*/
if (nd_pfn->mode == PFN_MODE_PMEM)
- offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE);
+ offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
else if (nd_pfn->mode == PFN_MODE_RAM)
- offset = SZ_8K;
+ offset = ALIGN(SZ_8K, nd_pfn->align);
else
goto err;
@@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
pfn_sb->npfns = cpu_to_le64(npfns);
memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
+ memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum);
@@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
if (rc)
return rc;
- if (PAGE_SIZE != SZ_4K) {
- dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n");
- return -ENXIO;
- }
- if (nsio->res.start & ND_PFN_MASK) {
- dev_err(dev, "%s not memory hotplug section aligned\n",
- dev_name(&ndns->dev));
- return -ENXIO;
- }
-
pfn_sb = nd_pfn->pfn_sb;
offset = le64_to_cpu(pfn_sb->dataoff);
nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
if (nd_pfn->mode == PFN_MODE_RAM) {
- if (offset != SZ_8K)
+ if (offset < SZ_8K)
return -EINVAL;
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
altmap = NULL;
@@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev)
pmem->ndns = ndns;
dev_set_drvdata(dev, pmem);
ndns->rw_bytes = pmem_rw_bytes;
+ if (devm_init_badblocks(dev, &pmem->bb))
+ return -ENOMEM;
+ nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
if (is_nd_btt(dev))
return nvdimm_namespace_attach_btt(ndns);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 529f3f02e7b2..139bf71ca549 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region)
}
EXPORT_SYMBOL(nd_region_to_nstype);
-static int is_uuid_busy(struct device *dev, void *data)
-{
- struct nd_region *nd_region = to_nd_region(dev->parent);
- u8 *uuid = data;
-
- switch (nd_region_to_nstype(nd_region)) {
- case ND_DEVICE_NAMESPACE_PMEM: {
- struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
-
- if (!nspm->uuid)
- break;
- if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
- return -EBUSY;
- break;
- }
- case ND_DEVICE_NAMESPACE_BLK: {
- struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
- if (!nsblk->uuid)
- break;
- if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
- return -EBUSY;
- break;
- }
- default:
- break;
- }
-
- return 0;
-}
-
-static int is_namespace_uuid_busy(struct device *dev, void *data)
-{
- if (is_nd_pmem(dev) || is_nd_blk(dev))
- return device_for_each_child(dev, data, is_uuid_busy);
- return 0;
-}
-
-/**
- * nd_is_uuid_unique - verify that no other namespace has @uuid
- * @dev: any device on a nvdimm_bus
- * @uuid: uuid to check
- */
-bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
-{
- struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
-
- if (!nvdimm_bus)
- return false;
- WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
- if (device_for_each_child(&nvdimm_bus->dev, uuid,
- is_namespace_uuid_busy) != 0)
- return false;
- return true;
-}
-
static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
struct nd_interleave_set *nd_set = nd_region->nd_set;
int type = nd_region_to_nstype(nd_region);
+ if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
+ return 0;
+
if (a != &dev_attr_set_cookie.attr
&& a != &dev_attr_available_size.attr)
return a->mode;
@@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
nd_region_create_blk_seed(nd_region);
nvdimm_bus_unlock(dev);
}
+ if (is_nd_pfn(dev) && probe) {
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->pfn_seed == dev)
+ nd_region_create_pfn_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
}
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 01b8e0d4b4ff..d878e4860fb7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
+static struct inode *bdev_file_inode(struct file *file)
+{
+ return file->f_mapping->host;
+}
+
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
@@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
- struct inode *bd_inode = filp->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
int error;
@@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ if (!blkdev_dax_capable(bdev))
+ bdev->bd_inode->i_flags &= ~S_DAX;
+ }
/*
* If the device is invalidated, rescan partition
@@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
+
if (ret)
goto out_clear;
} else {
@@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- /*
- * If the partition is not aligned on a page
- * boundary, we can't do dax I/O to it.
- */
- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
- (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
@@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
- struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
@@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
@@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
@@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents. Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing. We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static void blkdev_vm_open(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static void blkdev_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count--;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = blkdev_dax_fault,
+ .pmd_fault = blkdev_dax_pmd_fault,
+ .pfn_mkwrite = blkdev_dax_fault,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ file_accessed(file);
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ if (IS_DAX(bd_inode)) {
+ vma->vm_ops = &blkdev_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ } else {
+ vma->vm_ops = &blkdev_default_vm_ops;
+ }
+ mutex_unlock(&bd_inode->i_mutex);
+
+ return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = generic_file_mmap,
+ .mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
new file mode 100644
index 000000000000..c3bdf8c59480
--- /dev/null
+++ b/include/linux/badblocks.h
@@ -0,0 +1,65 @@
+#ifndef _LINUX_BADBLOCKS_H
+#define _LINUX_BADBLOCKS_H
+
+#include <linux/seqlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+#define BB_LEN_MASK (0x00000000000001FFULL)
+#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
+#define BB_ACK_MASK (0x8000000000000000ULL)
+#define BB_MAX_LEN 512
+#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
+#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+/* Bad block numbers are stored sorted in a single page.
+ * 64bits is used for each block or extent.
+ * 54 bits are sector number, 9 bits are extent size,
+ * 1 bit is an 'acknowledged' flag.
+ */
+#define MAX_BADBLOCKS (PAGE_SIZE/8)
+
+struct badblocks {
+ struct device *dev; /* set by devm_init_badblocks */
+ int count; /* count of bad blocks */
+ int unacked_exist; /* there probably are unacknowledged
+ * bad blocks. This is only cleared
+ * when a read discovers none
+ */
+ int shift; /* shift from sectors to block size
+ * a -ve shift means badblocks are
+ * disabled.*/
+ u64 *page; /* badblock list */
+ int changed;
+ seqlock_t lock;
+ sector_t sector;
+ sector_t size; /* in sectors */
+};
+
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged);
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
+void ack_all_badblocks(struct badblocks *bb);
+ssize_t badblocks_show(struct badblocks *bb, char *page, int unack);
+ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
+ int unack);
+int badblocks_init(struct badblocks *bb, int enable);
+void badblocks_exit(struct badblocks *bb);
+struct device;
+int devm_init_badblocks(struct device *dev, struct badblocks *bb);
+static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
+{
+ if (bb->dev != dev) {
+ dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n",
+ __func__);
+ return;
+ }
+ badblocks_exit(bb);
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 731262c3fbb7..eb73d74ed992 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -483,6 +483,9 @@ struct block_device {
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
+#ifdef CONFIG_FS_DAX
+ int bd_map_count;
+#endif
};
/*
@@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
+#ifdef CONFIG_FS_DAX
+extern bool blkdev_dax_capable(struct block_device *bdev);
+#else
+static inline bool blkdev_dax_capable(struct block_device *bdev)
+{
+ return false;
+}
+#endif
extern struct super_block *blockdev_superblock;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 847cc1d91634..5c706765404a 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -162,6 +162,7 @@ struct disk_part_tbl {
};
struct disk_events;
+struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -213,6 +214,7 @@ struct gendisk {
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
+ struct badblocks *bb;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 3f021dc5da8c..bed40dff0e86 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
}
+int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
#define nvdimm_bus_register(parent, desc) \
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b38e647664a0..b8a5b3b86ad6 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -188,6 +188,8 @@ struct inodes_stat_t {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKDAXSET _IO(0x12,128)
+#define BLKDAXGET _IO(0x12,129)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
diff --git a/kernel/resource.c b/kernel/resource.c
index f150dbbe6f62..09c0597840b0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
+ /*
+ * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
+ * or CONFIG_IO_STRICT_DEVMEM is enabled and the
+ * resource is busy.
+ */
+ if ((p->flags & IORESOURCE_BUSY) == 0)
+ continue;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e2a236a7341f..ee1ac1cc082c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1886,3 +1886,42 @@ source "samples/Kconfig"
source "lib/Kconfig.kgdb"
+config ARCH_HAS_DEVMEM_IS_ALLOWED
+ bool
+
+config STRICT_DEVMEM
+ bool "Filter access to /dev/mem"
+ depends on MMU
+ depends on ARCH_HAS_DEVMEM_IS_ALLOWED
+ default y if TILE || PPC
+ ---help---
+ If this option is disabled, you allow userspace (root) access to all
+ of memory, including kernel and userspace memory. Accidental
+ access to this is obviously disastrous, but specific access can
+ be used by people debugging the kernel. Note that with PAT support
+ enabled, even in this case there are restrictions on /dev/mem
+ use due to the cache aliasing requirements.
+
+ If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
+ file only allows userspace access to PCI space and the BIOS code and
+ data regions. This is sufficient for dosemu and X and all common
+ users of /dev/mem.
+
+ If in doubt, say Y.
+
+config IO_STRICT_DEVMEM
+ bool "Filter I/O access to /dev/mem"
+ depends on STRICT_DEVMEM
+ default STRICT_DEVMEM
+ ---help---
+ If this option is disabled, you allow userspace (root) access to all
+ io-memory regardless of whether a driver is actively using that
+ range. Accidental access to this is obviously disastrous, but
+ specific access can be used by people debugging kernel drivers.
+
+ If this option is switched on, the /dev/mem file only allows
+ userspace access to *idle* io-memory ranges (see /proc/iomem) This
+ may break traditional users of /dev/mem (dosemu, legacy X, etc...)
+ if the driver using a given range cannot be disabled.
+
+ If in doubt, say Y.
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 38b00ecb2ed5..a34bfd0c8928 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap
ldflags-y += --wrap=__devm_request_region
ldflags-y += --wrap=__request_region
ldflags-y += --wrap=__release_region
+ldflags-y += --wrap=devm_memremap_pages
+ldflags-y += --wrap=phys_to_pfn_t
DRIVERS := ../../../drivers
NVDIMM_SRC := $(DRIVERS)/nvdimm
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index b7251314bbc0..7ec7df9e7fc7 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/io.h>
+#include <linux/mm.h>
#include "nfit_test.h"
static LIST_HEAD(iomap_head);
@@ -41,7 +42,7 @@ void nfit_test_teardown(void)
}
EXPORT_SYMBOL(nfit_test_teardown);
-static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
+static struct nfit_test_resource *__get_nfit_res(resource_size_t resource)
{
struct iomap_ops *ops;
@@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
return NULL;
}
-void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
- void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
+static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *res;
rcu_read_lock();
- nfit_res = get_nfit_res(offset);
+ res = __get_nfit_res(resource);
rcu_read_unlock();
+
+ return res;
+}
+
+void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
+ void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
+{
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
+
if (nfit_res)
return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start;
@@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
resource_size_t offset, unsigned long size)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
- rcu_read_lock();
- nfit_res = get_nfit_res(offset);
- rcu_read_unlock();
if (nfit_res)
return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start;
@@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
- rcu_read_lock();
- nfit_res = get_nfit_res(offset);
- rcu_read_unlock();
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap(dev, offset, size, flags);
}
EXPORT_SYMBOL(__wrap_devm_memremap);
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+#include <linux/memremap.h>
+#include <linux/pfn_t.h>
+
+void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap)
+{
+ resource_size_t offset = res->start;
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
+
+ if (nfit_res)
+ return nfit_res->buf + offset - nfit_res->res->start;
+ return devm_memremap_pages(dev, res, ref, altmap);
+}
+EXPORT_SYMBOL(__wrap_devm_memremap_pages);
+
+pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+{
+ struct nfit_test_resource *nfit_res = get_nfit_res(addr);
+
+ if (nfit_res)
+ flags &= ~PFN_MAP;
+ return phys_to_pfn_t(addr, flags);
+}
+EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
+#else
+/* to be removed post 4.5-rc1 */
+void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res)
+{
+ resource_size_t offset = res->start;
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
+
+ if (nfit_res)
+ return nfit_res->buf + offset - nfit_res->res->start;
+ return devm_memremap_pages(dev, res);
+}
+EXPORT_SYMBOL(__wrap_devm_memremap_pages);
+#endif
+
void *__wrap_memremap(resource_size_t offset, size_t size,
unsigned long flags)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *nfit_res = get_nfit_res(offset);
- rcu_read_lock();
- nfit_res = get_nfit_res(offset);
- rcu_read_unlock();
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return memremap(offset, size, flags);
@@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap);
void __wrap_devm_memunmap(struct device *dev, void *addr)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
- rcu_read_lock();
- nfit_res = get_nfit_res((unsigned long) addr);
- rcu_read_unlock();
if (nfit_res)
return;
return devm_memunmap(dev, addr);
@@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc);
void __wrap_iounmap(volatile void __iomem *addr)
{
- struct nfit_test_resource *nfit_res;
-
- rcu_read_lock();
- nfit_res = get_nfit_res((unsigned long) addr);
- rcu_read_unlock();
+ struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
if (nfit_res)
return;
return iounmap(addr);
@@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap);
void __wrap_memunmap(void *addr)
{
- struct nfit_test_resource *nfit_res;
+ struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
- rcu_read_lock();
- nfit_res = get_nfit_res((unsigned long) addr);
- rcu_read_unlock();
if (nfit_res)
return;
return memunmap(addr);
@@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev,
struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) {
- rcu_read_lock();
nfit_res = get_nfit_res(start);
- rcu_read_unlock();
if (nfit_res) {
struct resource *res = nfit_res->res + 1;
@@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) {
- rcu_read_lock();
nfit_res = get_nfit_res(start);
- rcu_read_unlock();
if (nfit_res) {
struct resource *res = nfit_res->res + 1;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 51cf8256c6cd..90bd2ea41032 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
nd_cmd->out_length = 256;
nd_cmd->num_records = 0;
+ nd_cmd->address = 0;
+ nd_cmd->length = -1ULL;
nd_cmd->status = 0;
return 0;
@@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t)
struct acpi_nfit_memory_map *memdev;
struct acpi_nfit_control_region *dcr;
struct acpi_nfit_system_address *spa;
+ struct nvdimm_bus_descriptor *nd_desc;
+ struct acpi_nfit_desc *acpi_desc;
offset = 0;
/* spa0 (flat range with no bdw aliasing) */
@@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t)
dcr->command_size = 0;
dcr->status_offset = 0;
dcr->status_size = 0;
+
+ acpi_desc = &t->acpi_desc;
+ set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
+ set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
+ set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
+ nd_desc = &acpi_desc->nd_desc;
+ nd_desc->ndctl = nfit_test_ctl;
}
static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,