aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--CREDITS4
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst20
-rw-r--r--Documentation/arm64/memory.rst9
-rw-r--r--Documentation/core-api/memory-allocation.rst4
-rw-r--r--Documentation/dev-tools/kselftest.rst16
-rw-r--r--MAINTAINERS5
-rw-r--r--Makefile5
-rw-r--r--arch/arm/crypto/Kconfig1
-rw-r--r--arch/arm/crypto/aes-ce-core.S1
-rw-r--r--arch/arm64/Kconfig15
-rw-r--r--arch/arm64/Makefile16
-rw-r--r--arch/arm64/include/asm/atomic_lse.h6
-rw-r--r--arch/arm64/include/asm/vdso/compat_barrier.h2
-rw-r--r--arch/arm64/include/asm/vdso_datapage.h33
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c5
-rw-r--r--arch/arm64/kernel/cpu_errata.c4
-rw-r--r--arch/arm64/kernel/cpufeature.c1
-rw-r--r--arch/arm64/kernel/entry.S1
-rw-r--r--arch/arm64/kernel/ftrace.c12
-rw-r--r--arch/arm64/kernel/process.c32
-rw-r--r--arch/arm64/kernel/vdso/gettimeofday.S0
-rw-r--r--arch/arm64/kernel/vdso32/Makefile44
-rw-r--r--arch/arm64/mm/fault.c13
-rw-r--r--arch/x86/include/asm/uaccess.h23
-rw-r--r--block/blk-rq-qos.c14
-rw-r--r--block/blk-rq-qos.h4
-rw-r--r--block/blk-wbt.c6
-rw-r--r--drivers/ata/libata-scsi.c21
-rw-r--r--drivers/block/nbd.c2
-rw-r--r--drivers/block/null_blk_zoned.c3
-rw-r--r--drivers/gpio/gpio-eic-sprd.c7
-rw-r--r--drivers/gpio/gpio-max77620.c6
-rw-r--r--drivers/gpio/gpiolib-of.c2
-rw-r--r--drivers/gpio/gpiolib.c27
-rw-r--r--drivers/infiniband/core/cm.c3
-rw-r--r--drivers/infiniband/core/cma.c3
-rw-r--r--drivers/infiniband/core/device.c9
-rw-r--r--drivers/infiniband/core/nldev.c12
-rw-r--r--drivers/infiniband/core/security.c2
-rw-r--r--drivers/infiniband/core/umem_odp.c6
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c7
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c28
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c10
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.c5
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_verbs.c4
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c58
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h3
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c68
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c58
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c2
-rw-r--r--drivers/infiniband/sw/siw/siw_qp.c15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/mr.c8
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/tree-log.c36
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/io_uring.c24
-rw-r--r--fs/libfs.c137
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/xattr.c56
-rw-r--r--fs/readdir.c44
-rw-r--r--fs/super.c5
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/scrub/refcount.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c4
-rw-r--r--fs/xfs/xfs_buf.c12
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--include/linux/leds.h5
-rw-r--r--include/linux/memcontrol.h29
-rw-r--r--include/linux/slab.h4
-rw-r--r--include/linux/uaccess.h6
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/freezer.c6
-rw-r--r--kernel/panic.c1
-rw-r--r--lib/vdso/Kconfig9
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/memcontrol.c5
-rw-r--r--mm/memremap.c2
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/shuffle.c2
-rw-r--r--mm/slab_common.c19
-rw-r--r--mm/slob.c62
-rw-r--r--mm/slub.c14
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/vmpressure.c20
-rw-r--r--mm/vmscan.c72
-rw-r--r--mm/z3fold.c10
-rw-r--r--security/selinux/ss/services.c9
-rw-r--r--tools/testing/selftests/Makefile19
-rw-r--r--tools/testing/selftests/kselftest/runner.sh36
-rwxr-xr-xtools/testing/selftests/kselftest_install.sh4
-rw-r--r--tools/testing/selftests/rtc/settings1
-rw-r--r--tools/testing/selftests/watchdog/watchdog-test.c27
103 files changed, 891 insertions, 584 deletions
diff --git a/CREDITS b/CREDITS
index 8b67a85844b5..031605d46b4d 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1637,6 +1637,10 @@ S: Panoramastrasse 18
S: D-69126 Heidelberg
S: Germany
+N: Simon Horman
+M: horms@verge.net.au
+D: Renesas ARM/ARM64 SoC maintainer
+
N: Christopher Horn
E: chorn@warwick.net
D: Miscellaneous sysctl hacks
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0fa8c0e615c2..5361ebec3361 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -615,8 +615,8 @@ on an IO device and is an example of this type.
Protections
-----------
-A cgroup is protected to be allocated upto the configured amount of
-the resource if the usages of all its ancestors are under their
+A cgroup is protected upto the configured amount of the resource
+as long as the usages of all its ancestors are under their
protected levels. Protections can be hard guarantees or best effort
soft boundaries. Protections can also be over-committed in which case
only upto the amount available to the parent is protected among
@@ -1096,7 +1096,10 @@ PAGE_SIZE multiple when read back.
is within its effective min boundary, the cgroup's memory
won't be reclaimed under any conditions. If there is no
unprotected reclaimable memory available, OOM killer
- is invoked.
+ is invoked. Above the effective min boundary (or
+ effective low boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective min boundary is limited by memory.min values of
all ancestor cgroups. If there is memory.min overcommitment
@@ -1118,7 +1121,10 @@ PAGE_SIZE multiple when read back.
Best-effort memory protection. If the memory usage of a
cgroup is within its effective low boundary, the cgroup's
memory won't be reclaimed unless memory can be reclaimed
- from unprotected cgroups.
+ from unprotected cgroups. Above the effective low boundary (or
+ effective min boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
@@ -2482,8 +2488,10 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it's within its low,
-which makes delegation of subtrees possible.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low, which makes delegation of subtrees possible. It also
+enjoys having reclaim pressure proportional to its overage when
+above its effective low.
The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
index b040909e45f8..02e02175e6f5 100644
--- a/Documentation/arm64/memory.rst
+++ b/Documentation/arm64/memory.rst
@@ -154,11 +154,18 @@ return virtual addresses to userspace from a 48-bit range.
Software can "opt-in" to receiving VAs from a 52-bit space by
specifying an mmap hint parameter that is larger than 48-bit.
+
For example:
- maybe_high_address = mmap(~0UL, size, prot, flags,...);
+
+.. code-block:: c
+
+ maybe_high_address = mmap(~0UL, size, prot, flags,...);
It is also possible to build a debug kernel that returns addresses
from a 52-bit space by enabling the following kernel config options:
+
+.. code-block:: sh
+
CONFIG_EXPERT=y && CONFIG_ARM64_FORCE_52BIT=y
Note that this option is only intended for debugging applications
diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst
index 7744aa3bf2e0..939e3dfc86e9 100644
--- a/Documentation/core-api/memory-allocation.rst
+++ b/Documentation/core-api/memory-allocation.rst
@@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel
configuration, but it is a good practice to use `kmalloc` for objects
smaller than page size.
+The address of a chunk allocated with `kmalloc` is aligned to at least
+ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the
+alignment is also guaranteed to be at least the respective size.
+
For large allocations you can use :c:func:`vmalloc` and
:c:func:`vzalloc`, or directly request pages from the page
allocator. The memory allocated by `vmalloc` and related functions is
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index 25604904fa6e..ecdfdc9d4b03 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -89,6 +89,22 @@ To build, save output files in a separate directory with KBUILD_OUTPUT ::
$ export KBUILD_OUTPUT=/tmp/kselftest; make TARGETS="size timers" kselftest
+Additionally you can use the "SKIP_TARGETS" variable on the make command
+line to specify one or more targets to exclude from the TARGETS list.
+
+To run all tests but a single subsystem::
+
+ $ make -C tools/testing/selftests SKIP_TARGETS=ptrace run_tests
+
+You can specify multiple tests to skip::
+
+ $ make SKIP_TARGETS="size timers" kselftest
+
+You can also specify a restricted list of tests to run together with a
+dedicated skiplist::
+
+ $ make TARGETS="bpf breakpoints size timers" SKIP_TARGETS=bpf kselftest
+
See the top-level tools/testing/selftests/Makefile for the list of all
possible targets.
diff --git a/MAINTAINERS b/MAINTAINERS
index 55199ef7fa74..d44d6732510d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2165,12 +2165,10 @@ F: arch/arm64/boot/dts/realtek/
F: Documentation/devicetree/bindings/arm/realtek.yaml
ARM/RENESAS ARM64 ARCHITECTURE
-M: Simon Horman <horms@verge.net.au>
M: Geert Uytterhoeven <geert+renesas@glider.be>
M: Magnus Damm <magnus.damm@gmail.com>
L: linux-renesas-soc@vger.kernel.org
Q: http://patchwork.kernel.org/project/linux-renesas-soc/list/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
S: Supported
F: arch/arm64/boot/dts/renesas/
@@ -2282,12 +2280,10 @@ S: Maintained
F: drivers/media/platform/s5p-mfc/
ARM/SHMOBILE ARM ARCHITECTURE
-M: Simon Horman <horms@verge.net.au>
M: Geert Uytterhoeven <geert+renesas@glider.be>
M: Magnus Damm <magnus.damm@gmail.com>
L: linux-renesas-soc@vger.kernel.org
Q: http://patchwork.kernel.org/project/linux-renesas-soc/list/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
S: Supported
F: arch/arm/boot/dts/emev2*
@@ -9187,6 +9183,7 @@ M: Pavel Machek <pavel@ucw.cz>
R: Dan Murphy <dmurphy@ti.com>
L: linux-leds@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/j.anaszewski/linux-leds.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/pavel/linux-leds.git
S: Maintained
F: Documentation/devicetree/bindings/leds/
F: drivers/leds/
diff --git a/Makefile b/Makefile
index f47dfdec7086..c73922ad4d36 100644
--- a/Makefile
+++ b/Makefile
@@ -1217,9 +1217,8 @@ PHONY += kselftest
kselftest:
$(Q)$(MAKE) -C $(srctree)/tools/testing/selftests run_tests
-PHONY += kselftest-clean
-kselftest-clean:
- $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests clean
+kselftest-%: FORCE
+ $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests $*
PHONY += kselftest-merge
kselftest-merge:
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index b24df84a1d7a..043b0b18bf7e 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -98,6 +98,7 @@ config CRYPTO_AES_ARM_CE
tristate "Accelerated AES using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
+ select CRYPTO_LIB_AES
select CRYPTO_SIMD
help
Use an implementation of AES in CBC, CTR and XTS modes that uses
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index b978cdf133af..4d1707388d94 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -9,6 +9,7 @@
#include <asm/assembler.h>
.text
+ .arch armv8-a
.fpu crypto-neon-fp-armv8
.align 3
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 41a9b4257b72..950a56b71ff0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -110,7 +110,6 @@ config ARM64
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
- select GENERIC_COMPAT_VDSO if (!CPU_BIG_ENDIAN && COMPAT)
select HANDLE_DOMAIN_IRQ
select HARDIRQS_SW_RESEND
select HAVE_PCI
@@ -1159,7 +1158,7 @@ menuconfig COMPAT
if COMPAT
config KUSER_HELPERS
- bool "Enable kuser helpers page for 32 bit applications"
+ bool "Enable kuser helpers page for 32-bit applications"
default y
help
Warning: disabling this option may break 32-bit user programs.
@@ -1185,6 +1184,18 @@ config KUSER_HELPERS
Say N here only if you are absolutely certain that you do not
need these helpers; otherwise, the safe option is to say Y.
+config COMPAT_VDSO
+ bool "Enable vDSO for 32-bit applications"
+ depends on !CPU_BIG_ENDIAN && "$(CROSS_COMPILE_COMPAT)" != ""
+ select GENERIC_COMPAT_VDSO
+ default y
+ help
+ Place in the process address space of 32-bit applications an
+ ELF shared object providing fast implementations of gettimeofday
+ and clock_gettime.
+
+ You must have a 32-bit build of glibc 2.22 or later for programs
+ to seamlessly take advantage of this.
menuconfig ARMV8_DEPRECATED
bool "Emulate deprecated/obsolete ARMv8 instructions"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 84a3d502c5a5..2c0238ce0551 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -53,22 +53,6 @@ $(warning Detected assembler with broken .inst; disassembly will be unreliable)
endif
endif
-ifeq ($(CONFIG_GENERIC_COMPAT_VDSO), y)
- CROSS_COMPILE_COMPAT ?= $(CONFIG_CROSS_COMPILE_COMPAT_VDSO:"%"=%)
-
- ifeq ($(CONFIG_CC_IS_CLANG), y)
- $(warning CROSS_COMPILE_COMPAT is clang, the compat vDSO will not be built)
- else ifeq ($(strip $(CROSS_COMPILE_COMPAT)),)
- $(warning CROSS_COMPILE_COMPAT not defined or empty, the compat vDSO will not be built)
- else ifeq ($(shell which $(CROSS_COMPILE_COMPAT)gcc 2> /dev/null),)
- $(error $(CROSS_COMPILE_COMPAT)gcc not found, check CROSS_COMPILE_COMPAT)
- else
- export CROSS_COMPILE_COMPAT
- export CONFIG_COMPAT_VDSO := y
- compat_vdso := -DCONFIG_COMPAT_VDSO=1
- endif
-endif
-
KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst) \
$(compat_vdso) $(cc_has_k_constraint)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index c6bd87d2915b..574808b9df4c 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -321,7 +321,8 @@ static inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
}
#define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...) \
-static inline u##sz __lse__cmpxchg_case_##name##sz(volatile void *ptr, \
+static __always_inline u##sz \
+__lse__cmpxchg_case_##name##sz(volatile void *ptr, \
u##sz old, \
u##sz new) \
{ \
@@ -362,7 +363,8 @@ __CMPXCHG_CASE(x, , mb_, 64, al, "memory")
#undef __CMPXCHG_CASE
#define __CMPXCHG_DBL(name, mb, cl...) \
-static inline long __lse__cmpxchg_double##name(unsigned long old1, \
+static __always_inline long \
+__lse__cmpxchg_double##name(unsigned long old1, \
unsigned long old2, \
unsigned long new1, \
unsigned long new2, \
diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h
index fb60a88b5ed4..3fd8fd6d8fc2 100644
--- a/arch/arm64/include/asm/vdso/compat_barrier.h
+++ b/arch/arm64/include/asm/vdso/compat_barrier.h
@@ -20,7 +20,7 @@
#define dmb(option) __asm__ __volatile__ ("dmb " #option : : : "memory")
-#if __LINUX_ARM_ARCH__ >= 8
+#if __LINUX_ARM_ARCH__ >= 8 && defined(CONFIG_AS_DMB_ISHLD)
#define aarch32_smp_mb() dmb(ish)
#define aarch32_smp_rmb() dmb(ishld)
#define aarch32_smp_wmb() dmb(ishst)
diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h
deleted file mode 100644
index 1f38bf330a6e..000000000000
--- a/arch/arm64/include/asm/vdso_datapage.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012 ARM Limited
- */
-#ifndef __ASM_VDSO_DATAPAGE_H
-#define __ASM_VDSO_DATAPAGE_H
-
-#ifndef __ASSEMBLY__
-
-struct vdso_data {
- __u64 cs_cycle_last; /* Timebase at clocksource init */
- __u64 raw_time_sec; /* Raw time */
- __u64 raw_time_nsec;
- __u64 xtime_clock_sec; /* Kernel time */
- __u64 xtime_clock_nsec;
- __u64 xtime_coarse_sec; /* Coarse time */
- __u64 xtime_coarse_nsec;
- __u64 wtm_clock_sec; /* Wall to monotonic time */
- __u64 wtm_clock_nsec;
- __u32 tb_seq_count; /* Timebase sequence counter */
- /* cs_* members must be adjacent and in this order (ldp accesses) */
- __u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */
- __u32 cs_shift; /* Clocksource shift (mono = raw) */
- __u32 cs_raw_mult; /* Raw clocksource multiplier */
- __u32 tz_minuteswest; /* Whacky timezone stuff */
- __u32 tz_dsttime;
- __u32 use_syscall;
- __u32 hrtimer_res;
-};
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ASM_VDSO_DATAPAGE_H */
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 2ec09debc2bb..ca158be21f83 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -174,6 +174,9 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops)
struct insn_emulation *insn;
insn = kzalloc(sizeof(*insn), GFP_KERNEL);
+ if (!insn)
+ return;
+
insn->ops = ops;
insn->min = INSN_UNDEF;
@@ -233,6 +236,8 @@ static void __init register_insn_emulation_sysctl(void)
insns_sysctl = kcalloc(nr_insn_emulated + 1, sizeof(*sysctl),
GFP_KERNEL);
+ if (!insns_sysctl)
+ return;
raw_spin_lock_irqsave(&insn_emulation_lock, flags);
list_for_each_entry(insn, &insn_emulation, node) {
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 1e43ba5c79b7..f593f4cffc0d 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -128,8 +128,8 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn,
int cpu, slot = -1;
/*
- * enable_smccc_arch_workaround_1() passes NULL for the hyp_vecs
- * start/end if we're a guest. Skip the hyp-vectors work.
+ * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
+ * we're a guest. Skip the hyp-vectors work.
*/
if (!hyp_vecs_start) {
__this_cpu_write(bp_hardening_data.fn, fn);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9323bcc40a58..cabebf1a7976 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -136,6 +136,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPI_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 84a822748c84..e304fe04b098 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -775,6 +775,7 @@ el0_sync_compat:
b.ge el0_dbg
b el0_inv
el0_svc_compat:
+ gic_prio_kentry_setup tmp=x1
mov x0, sp
bl el0_svc_compat_handler
b ret_to_user
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 171773257974..06e56b470315 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -121,10 +121,16 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
/*
* Ensure updated trampoline is visible to instruction
- * fetch before we patch in the branch.
+ * fetch before we patch in the branch. Although the
+ * architecture doesn't require an IPI in this case,
+ * Neoverse-N1 erratum #1542419 does require one
+ * if the TLB maintenance in module_enable_ro() is
+ * skipped due to rodata_enabled. It doesn't seem worth
+ * it to make it conditional given that this is
+ * certainly not a fast-path.
*/
- __flush_icache_range((unsigned long)&dst[0],
- (unsigned long)&dst[1]);
+ flush_icache_range((unsigned long)&dst[0],
+ (unsigned long)&dst[1]);
}
addr = (unsigned long)dst;
#else /* CONFIG_ARM64_MODULE_PLTS */
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a47462def04b..1fb2819fc048 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -332,22 +332,27 @@ void arch_release_task_struct(struct task_struct *tsk)
fpsimd_release_task(tsk);
}
-/*
- * src and dst may temporarily have aliased sve_state after task_struct
- * is copied. We cannot fix this properly here, because src may have
- * live SVE state and dst's thread_info may not exist yet, so tweaking
- * either src's or dst's TIF_SVE is not safe.
- *
- * The unaliasing is done in copy_thread() instead. This works because
- * dst is not schedulable or traceable until both of these functions
- * have been called.
- */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
if (current->mm)
fpsimd_preserve_current_state();
*dst = *src;
+ /* We rely on the above assignment to initialize dst's thread_flags: */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK));
+
+ /*
+ * Detach src's sve_state (if any) from dst so that it does not
+ * get erroneously used or freed prematurely. dst's sve_state
+ * will be allocated on demand later on if dst uses SVE.
+ * For consistency, also clear TIF_SVE here: this could be done
+ * later in copy_process(), but to avoid tripping up future
+ * maintainers it is best not to leave TIF_SVE and sve_state in
+ * an inconsistent state, even temporarily.
+ */
+ dst->thread.sve_state = NULL;
+ clear_tsk_thread_flag(dst, TIF_SVE);
+
return 0;
}
@@ -361,13 +366,6 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
/*
- * Unalias p->thread.sve_state (if any) from the parent task
- * and disable discard SVE state for p:
- */
- clear_tsk_thread_flag(p, TIF_SVE);
- p->thread.sve_state = NULL;
-
- /*
* In case p was allocated the same task_struct pointer as some
* other recently-exited task, make sure p is disassociated from
* any cpu that may have run that now-exited task recently.
diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/arch/arm64/kernel/vdso/gettimeofday.S
+++ /dev/null
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index 1fba0776ed40..76b327f88fbb 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -8,15 +8,21 @@
ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
include $(srctree)/lib/vdso/Makefile
-COMPATCC := $(CROSS_COMPILE_COMPAT)gcc
+# Same as cc-*option, but using CC_COMPAT instead of CC
+ifeq ($(CONFIG_CC_IS_CLANG), y)
+CC_COMPAT ?= $(CC)
+else
+CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc
+endif
-# Same as cc-*option, but using COMPATCC instead of CC
cc32-option = $(call try-run,\
- $(COMPATCC) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
+ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
cc32-disable-warning = $(call try-run,\
- $(COMPATCC) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
+ $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
cc32-ldoption = $(call try-run,\
- $(COMPATCC) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
+ $(CC_COMPAT) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
+cc32-as-instr = $(call try-run,\
+ printf "%b\n" "$(1)" | $(CC_COMPAT) $(VDSO_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3))
# We cannot use the global flags to compile the vDSO files, the main reason
# being that the 32-bit compiler may be older than the main (64-bit) compiler
@@ -25,22 +31,21 @@ cc32-ldoption = $(call try-run,\
# arm64 one.
# As a result we set our own flags here.
-# From top-level Makefile
-# NOSTDINC_FLAGS
-VDSO_CPPFLAGS := -nostdinc -isystem $(shell $(COMPATCC) -print-file-name=include)
+# KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile
+VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include)
VDSO_CPPFLAGS += $(LINUXINCLUDE)
-VDSO_CPPFLAGS += $(KBUILD_CPPFLAGS)
# Common C and assembly flags
# From top-level Makefile
VDSO_CAFLAGS := $(VDSO_CPPFLAGS)
+ifneq ($(shell $(CC_COMPAT) --version 2>&1 | head -n 1 | grep clang),)
+VDSO_CAFLAGS += --target=$(notdir $(CROSS_COMPILE_COMPAT:%-=%))
+endif
+
VDSO_CAFLAGS += $(call cc32-option,-fno-PIE)
ifdef CONFIG_DEBUG_INFO
VDSO_CAFLAGS += -g
endif
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(COMPATCC)), y)
-VDSO_CAFLAGS += -DCC_HAVE_ASM_GOTO
-endif
# From arm Makefile
VDSO_CAFLAGS += $(call cc32-option,-fno-dwarf2-cfi-asm)
@@ -55,6 +60,7 @@ endif
VDSO_CAFLAGS += -fPIC -fno-builtin -fno-stack-protector
VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING
+
# Try to compile for ARMv8. If the compiler is too old and doesn't support it,
# fall back to v7. There is no easy way to check for what architecture the code
# is being compiled, so define a macro specifying that (see arch/arm/Makefile).
@@ -91,6 +97,12 @@ VDSO_CFLAGS += -Wno-int-to-pointer-cast
VDSO_AFLAGS := $(VDSO_CAFLAGS)
VDSO_AFLAGS += -D__ASSEMBLY__
+# Check for binutils support for dmb ishld
+dmbinstr := $(call cc32-as-instr,dmb ishld,-DCONFIG_AS_DMB_ISHLD=1)
+
+VDSO_CFLAGS += $(dmbinstr)
+VDSO_AFLAGS += $(dmbinstr)
+
VDSO_LDFLAGS := $(VDSO_CPPFLAGS)
# From arm vDSO Makefile
VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
@@ -159,14 +171,14 @@ quiet_cmd_vdsold_and_vdso_check = LD32 $@
cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check)
quiet_cmd_vdsold = LD32 $@
- cmd_vdsold = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \
+ cmd_vdsold = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \
-Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
quiet_cmd_vdsocc = CC32 $@
- cmd_vdsocc = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $<
+ cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $<
quiet_cmd_vdsocc_gettimeofday = CC32 $@
- cmd_vdsocc_gettimeofday = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) $(VDSO_CFLAGS_gettimeofday_o) -c -o $@ $<
+ cmd_vdsocc_gettimeofday = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) $(VDSO_CFLAGS_gettimeofday_o) -c -o $@ $<
quiet_cmd_vdsoas = AS32 $@
- cmd_vdsoas = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_AFLAGS) -c -o $@ $<
+ cmd_vdsoas = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_AFLAGS) -c -o $@ $<
quiet_cmd_vdsomunge = MUNGE $@
cmd_vdsomunge = $(obj)/$(munge) $< $@
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 115d7a0e4b08..855f2a7954e6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -113,6 +113,15 @@ static inline bool is_ttbr1_addr(unsigned long addr)
return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
}
+static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
+{
+ /* Either init_pg_dir or swapper_pg_dir */
+ if (mm == &init_mm)
+ return __pa_symbol(mm->pgd);
+
+ return (unsigned long)virt_to_phys(mm->pgd);
+}
+
/*
* Dump out the page tables associated with 'addr' in the currently active mm.
*/
@@ -141,7 +150,7 @@ static void show_pte(unsigned long addr)
pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
- vabits_actual, (unsigned long)virt_to_phys(mm->pgd));
+ vabits_actual, mm_to_pgd_phys(mm));
pgdp = pgd_offset(mm, addr);
pgd = READ_ONCE(*pgdp);
pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
@@ -266,7 +275,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
* If we got a different type of fault from the AT instruction,
* treat the translation fault as spurious.
*/
- dfsc = FIELD_PREP(SYS_PAR_EL1_FST, par);
+ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c225ede0e4..61d93f062a36 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -734,5 +734,28 @@ do { \
if (unlikely(__gu_err)) goto err_label; \
} while (0)
+/*
+ * We want the unsafe accessors to always be inlined and use
+ * the error labels - thus the macro games.
+ */
+#define unsafe_copy_loop(dst, src, len, type, label) \
+ while (len >= sizeof(type)) { \
+ unsafe_put_user(*(type *)src,(type __user *)dst,label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+#define unsafe_copy_to_user(_dst,_src,_len,label) \
+do { \
+ char __user *__ucu_dst = (_dst); \
+ const char *__ucu_src = (_src); \
+ size_t __ucu_len = (_len); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
+} while (0)
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 61b635bc2a31..656460636ad3 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -160,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd)
return ret;
}
-void rq_depth_scale_up(struct rq_depth *rqd)
+/* Returns true on success and false if scaling up wasn't possible */
+bool rq_depth_scale_up(struct rq_depth *rqd)
{
/*
* Hit max in previous round, stop here
*/
if (rqd->scaled_max)
- return;
+ return false;
rqd->scale_step--;
rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+ return true;
}
/*
* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
+ * had a latency violation. Returns true on success and returns false if
+ * scaling down wasn't possible.
*/
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
/*
* Stop scaling down when we've hit the limit. This also prevents
@@ -185,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
* keep up.
*/
if (rqd->max_depth == 1)
- return;
+ return false;
if (rqd->scale_step < 0 && hard_throttle)
rqd->scale_step = 0;
@@ -194,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
rqd->scaled_max = false;
rq_depth_calc_max_depth(rqd);
+ return true;
}
struct rq_qos_wait_data {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 08a09dbe0f4b..e8cb68f6958a 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -130,8 +130,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
acquire_inflight_cb_t *acquire_inflight_cb,
cleanup_cb_t *cleanup_cb);
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
-void rq_depth_scale_up(struct rq_depth *rqd);
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_scale_up(struct rq_depth *rqd);
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8af553a0ba00..8641ba9793c5 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -308,7 +308,8 @@ static void calc_wb_limits(struct rq_wb *rwb)
static void scale_up(struct rq_wb *rwb)
{
- rq_depth_scale_up(&rwb->rq_depth);
+ if (!rq_depth_scale_up(&rwb->rq_depth))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
rwb_wake_all(rwb);
@@ -317,7 +318,8 @@ static void scale_up(struct rq_wb *rwb)
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
- rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
+ if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
rwb_trace_step(rwb, "scale down");
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 76d0f9de767b..58e09ffe8b9c 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -4791,27 +4791,6 @@ void ata_scsi_hotplug(struct work_struct *work)
return;
}
- /*
- * XXX - UGLY HACK
- *
- * The block layer suspend/resume path is fundamentally broken due
- * to freezable kthreads and workqueue and may deadlock if a block
- * device gets removed while resume is in progress. I don't know
- * what the solution is short of removing freezable kthreads and
- * workqueues altogether.
- *
- * The following is an ugly hack to avoid kicking off device
- * removal while freezer is active. This is a joke but does avoid
- * this particular deadlock scenario.
- *
- * https://bugzilla.kernel.org/show_bug.cgi?id=62801
- * http://marc.info/?l=linux-kernel&m=138695698516487
- */
-#ifdef CONFIG_FREEZER
- while (pm_freezing)
- msleep(10);
-#endif
-
DPRINTK("ENTER\n");
mutex_lock(&ap->scsi_scan_mutex);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ac07e8c94c79..478aa86fc1f2 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -248,8 +248,8 @@ static void nbd_put(struct nbd_device *nbd)
if (refcount_dec_and_mutex_lock(&nbd->refs,
&nbd_index_mutex)) {
idr_remove(&nbd_index_idr, nbd->index);
- mutex_unlock(&nbd_index_mutex);
nbd_dev_remove(nbd);
+ mutex_unlock(&nbd_index_mutex);
}
}
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index eabc116832a7..3d7fdea872f8 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -142,8 +142,7 @@ static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
zone->wp = zone->start;
break;
default:
- cmd->error = BLK_STS_NOTSUPP;
- break;
+ return BLK_STS_NOTSUPP;
}
return BLK_STS_OK;
}
diff --git a/drivers/gpio/gpio-eic-sprd.c b/drivers/gpio/gpio-eic-sprd.c
index fe7a73f52329..bb287f35cf40 100644
--- a/drivers/gpio/gpio-eic-sprd.c
+++ b/drivers/gpio/gpio-eic-sprd.c
@@ -530,11 +530,12 @@ static void sprd_eic_handle_one_type(struct gpio_chip *chip)
}
for_each_set_bit(n, &reg, SPRD_EIC_PER_BANK_NR) {
- girq = irq_find_mapping(chip->irq.domain,
- bank * SPRD_EIC_PER_BANK_NR + n);
+ u32 offset = bank * SPRD_EIC_PER_BANK_NR + n;
+
+ girq = irq_find_mapping(chip->irq.domain, offset);
generic_handle_irq(girq);
- sprd_eic_toggle_trigger(chip, girq, n);
+ sprd_eic_toggle_trigger(chip, girq, offset);
}
}
}
diff --git a/drivers/gpio/gpio-max77620.c b/drivers/gpio/gpio-max77620.c
index 47d05e357e61..faf86ea9c51a 100644
--- a/drivers/gpio/gpio-max77620.c
+++ b/drivers/gpio/gpio-max77620.c
@@ -192,13 +192,13 @@ static int max77620_gpio_set_debounce(struct max77620_gpio *mgpio,
case 0:
val = MAX77620_CNFG_GPIO_DBNC_None;
break;
- case 1 ... 8:
+ case 1000 ... 8000:
val = MAX77620_CNFG_GPIO_DBNC_8ms;
break;
- case 9 ... 16:
+ case 9000 ... 16000:
val = MAX77620_CNFG_GPIO_DBNC_16ms;
break;
- case 17 ... 32:
+ case 17000 ... 32000:
val = MAX77620_CNFG_GPIO_DBNC_32ms;
break;
default:
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index 1eea2c6c2e1d..80ea49f570f4 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -317,7 +317,7 @@ struct gpio_desc *gpiod_get_from_of_node(struct device_node *node,
transitory = flags & OF_GPIO_TRANSITORY;
ret = gpiod_request(desc, label);
- if (ret == -EBUSY && (flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE))
+ if (ret == -EBUSY && (dflags & GPIOD_FLAGS_BIT_NONEXCLUSIVE))
return desc;
if (ret)
return ERR_PTR(ret);
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index bdbc1649eafa..5833e4f380d6 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3070,8 +3070,10 @@ int gpiod_direction_output(struct gpio_desc *desc, int value)
if (!ret)
goto set_output_value;
/* Emulate open drain by not actively driving the line high */
- if (value)
- return gpiod_direction_input(desc);
+ if (value) {
+ ret = gpiod_direction_input(desc);
+ goto set_output_flag;
+ }
}
else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) {
ret = gpio_set_config(gc, gpio_chip_hwgpio(desc),
@@ -3079,8 +3081,10 @@ int gpiod_direction_output(struct gpio_desc *desc, int value)
if (!ret)
goto set_output_value;
/* Emulate open source by not actively driving the line low */
- if (!value)
- return gpiod_direction_input(desc);
+ if (!value) {
+ ret = gpiod_direction_input(desc);
+ goto set_output_flag;
+ }
} else {
gpio_set_config(gc, gpio_chip_hwgpio(desc),
PIN_CONFIG_DRIVE_PUSH_PULL);
@@ -3088,6 +3092,17 @@ int gpiod_direction_output(struct gpio_desc *desc, int value)
set_output_value:
return gpiod_direction_output_raw_commit(desc, value);
+
+set_output_flag:
+ /*
+ * When emulating open-source or open-drain functionalities by not
+ * actively driving the line (setting mode to input) we still need to
+ * set the IS_OUT flag or otherwise we won't be able to set the line
+ * value anymore.
+ */
+ if (ret == 0)
+ set_bit(FLAG_IS_OUT, &desc->flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(gpiod_direction_output);
@@ -3448,8 +3463,6 @@ static void gpio_set_open_drain_value_commit(struct gpio_desc *desc, bool value)
if (value) {
ret = chip->direction_input(chip, offset);
- if (!ret)
- clear_bit(FLAG_IS_OUT, &desc->flags);
} else {
ret = chip->direction_output(chip, offset, 0);
if (!ret)
@@ -3479,8 +3492,6 @@ static void gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value
set_bit(FLAG_IS_OUT, &desc->flags);
} else {
ret = chip->direction_input(chip, offset);
- if (!ret)
- clear_bit(FLAG_IS_OUT, &desc->flags);
}
trace_gpio_direction(desc_to_gpio(desc), !value, ret);
if (ret < 0)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index da10e6ccb43c..5920c0085d35 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -4399,6 +4399,7 @@ error2:
error1:
port_modify.set_port_cap_mask = 0;
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+ kfree(port);
while (--i) {
if (!rdma_cap_ib_cm(ib_device, i))
continue;
@@ -4407,6 +4408,7 @@ error1:
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
+ kfree(port);
}
free:
kfree(cm_dev);
@@ -4460,6 +4462,7 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
spin_unlock_irq(&cm.state_lock);
ib_unregister_mad_agent(cur_mad_agent);
cm_remove_port_fs(port);
+ kfree(port);
}
kfree(cm_dev);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 0e3cf3461999..d78f67623f24 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2396,9 +2396,10 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
conn_id->cm_id.iw = NULL;
cma_exch(conn_id, RDMA_CM_DESTROYING);
mutex_unlock(&conn_id->handler_mutex);
+ mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
rdma_destroy_id(&conn_id->id);
- goto out;
+ return ret;
}
mutex_unlock(&conn_id->handler_mutex);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 99c4a55545cf..2dd2cfe9b561 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1987,8 +1987,6 @@ static int iw_query_port(struct ib_device *device,
if (!netdev)
return -ENODEV;
- dev_put(netdev);
-
port_attr->max_mtu = IB_MTU_4096;
port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
@@ -1996,19 +1994,22 @@ static int iw_query_port(struct ib_device *device,
port_attr->state = IB_PORT_DOWN;
port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
} else {
- inetdev = in_dev_get(netdev);
+ rcu_read_lock();
+ inetdev = __in_dev_get_rcu(netdev);
if (inetdev && inetdev->ifa_list) {
port_attr->state = IB_PORT_ACTIVE;
port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
- in_dev_put(inetdev);
} else {
port_attr->state = IB_PORT_INIT;
port_attr->phys_state =
IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
}
+
+ rcu_read_unlock();
}
+ dev_put(netdev);
err = device->ops.query_port(device, port_num, port_attr);
if (err)
return err;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 7a7474000100..65b36548bc17 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1230,7 +1230,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
- goto err;
+ goto err_get;
}
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
@@ -1787,10 +1787,6 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
- ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
- if (ret)
- goto err_unbind;
-
if (fill_nldev_handle(msg, device) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
@@ -1799,13 +1795,15 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_fill;
}
+ ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
+ if (ret)
+ goto err_fill;
+
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
err_fill:
- rdma_counter_bind_qpn(device, port, qpn, cntn);
-err_unbind:
nlmsg_free(msg);
err:
ib_device_put(device);
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 1ab423b19f77..6eb6d2717ca5 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -426,7 +426,7 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
int ret;
rdma_for_each_port (dev, i) {
- is_ib = rdma_protocol_ib(dev, i++);
+ is_ib = rdma_protocol_ib(dev, i);
if (is_ib)
break;
}
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index f67a30fda1ed..163ff7ba92b7 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -451,8 +451,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
* that the hardware will not attempt to access the MR any more.
*/
if (!umem_odp->is_implicit_odp) {
+ mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
kvfree(umem_odp->dma_list);
kvfree(umem_odp->page_list);
}
@@ -719,6 +721,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ lockdep_assert_held(&umem_odp->umem_mutex);
+
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
/* Note that during the run of this function, the
@@ -726,7 +730,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
* faults from completion. We might be racing with other
* invalidations, so we must make sure we free each page only
* once. */
- mutex_lock(&umem_odp->umem_mutex);
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
if (umem_odp->page_list[idx]) {
@@ -757,7 +760,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
umem_odp->npages--;
}
}
- mutex_unlock(&umem_odp->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index a8b9548bd1a2..599340c1f0b8 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -242,10 +242,13 @@ static void set_ep_sin6_addrs(struct c4iw_ep *ep,
}
}
-static int dump_qp(struct c4iw_qp *qp, struct c4iw_debugfs_data *qpd)
+static int dump_qp(unsigned long id, struct c4iw_qp *qp,
+ struct c4iw_debugfs_data *qpd)
{
int space;
int cc;
+ if (id != qp->wq.sq.qid)
+ return 0;
space = qpd->bufsize - qpd->pos - 1;
if (space == 0)
@@ -350,7 +353,7 @@ static int qp_open(struct inode *inode, struct file *file)
xa_lock_irq(&qpd->devp->qps);
xa_for_each(&qpd->devp->qps, index, qp)
- dump_qp(qp, qpd);
+ dump_qp(index, qp, qpd);
xa_unlock_irq(&qpd->devp->qps);
qpd->buf[qpd->pos++] = 0;
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index aa772ee0706f..35c284af574d 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -275,13 +275,17 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
struct sk_buff *skb, struct c4iw_wr_wait *wr_waitp)
{
int err;
- struct fw_ri_tpte tpt;
+ struct fw_ri_tpte *tpt;
u32 stag_idx;
static atomic_t key;
if (c4iw_fatal_error(rdev))
return -EIO;
+ tpt = kmalloc(sizeof(*tpt), GFP_KERNEL);
+ if (!tpt)
+ return -ENOMEM;
+
stag_state = stag_state > 0;
stag_idx = (*stag) >> 8;
@@ -291,6 +295,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
mutex_lock(&rdev->stats.lock);
rdev->stats.stag.fail++;
mutex_unlock(&rdev->stats.lock);
+ kfree(tpt);
return -ENOMEM;
}
mutex_lock(&rdev->stats.lock);
@@ -305,28 +310,28 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
/* write TPT entry */
if (reset_tpt_entry)
- memset(&tpt, 0, sizeof(tpt));
+ memset(tpt, 0, sizeof(*tpt));
else {
- tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
+ tpt->valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) |
FW_RI_TPTE_STAGSTATE_V(stag_state) |
FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid));
- tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
+ tpt->locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
(bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) |
FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO :
FW_RI_VA_BASED_TO))|
FW_RI_TPTE_PS_V(page_size));
- tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
+ tpt->nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3));
- tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
- tpt.va_hi = cpu_to_be32((u32)(to >> 32));
- tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
- tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
- tpt.len_hi = cpu_to_be32((u32)(len >> 32));
+ tpt->len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
+ tpt->va_hi = cpu_to_be32((u32)(to >> 32));
+ tpt->va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
+ tpt->dca_mwbcnt_pstag = cpu_to_be32(0);
+ tpt->len_hi = cpu_to_be32((u32)(len >> 32));
}
err = write_adapter_mem(rdev, stag_idx +
(rdev->lldi.vr->stag.start >> 5),
- sizeof(tpt), &tpt, skb, wr_waitp);
+ sizeof(*tpt), tpt, skb, wr_waitp);
if (reset_tpt_entry) {
c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -334,6 +339,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
rdev->stats.stag.cur -= 32;
mutex_unlock(&rdev->stats.lock);
}
+ kfree(tpt);
return err;
}
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index eb9368be28c1..bbcac539777a 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -2737,15 +2737,11 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs,
if (CHELSIO_CHIP_VERSION(rhp->rdev.lldi.adapter_type) > CHELSIO_T6)
srq->flags = T4_SRQ_LIMIT_SUPPORT;
- ret = xa_insert_irq(&rhp->qps, srq->wq.qid, srq, GFP_KERNEL);
- if (ret)
- goto err_free_queue;
-
if (udata) {
srq_key_mm = kmalloc(sizeof(*srq_key_mm), GFP_KERNEL);
if (!srq_key_mm) {
ret = -ENOMEM;
- goto err_remove_handle;
+ goto err_free_queue;
}
srq_db_key_mm = kmalloc(sizeof(*srq_db_key_mm), GFP_KERNEL);
if (!srq_db_key_mm) {
@@ -2789,8 +2785,6 @@ err_free_srq_db_key_mm:
kfree(srq_db_key_mm);
err_free_srq_key_mm:
kfree(srq_key_mm);
-err_remove_handle:
- xa_erase_irq(&rhp->qps, srq->wq.qid);
err_free_queue:
free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
srq->wr_waitp);
@@ -2813,8 +2807,6 @@ void c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
rhp = srq->rhp;
pr_debug("%s id %d\n", __func__, srq->wq.qid);
-
- xa_erase_irq(&rhp->qps, srq->wq.qid);
ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
ibucontext);
free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 2395fd4233a7..2ed7bfd5feea 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1526,8 +1526,11 @@ int sdma_init(struct hfi1_devdata *dd, u8 port)
}
ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(tmp_sdma_rht);
goto bail;
+ }
+
dd->sdma_rht = tmp_sdma_rht;
dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 8056930bbe2c..cd9ee1664a69 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2773,6 +2773,10 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
return -ENOMEM;
iwibdev = iwdev->iwibdev;
rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
+ ret = ib_device_set_netdev(&iwibdev->ibdev, iwdev->netdev, 1);
+ if (ret)
+ goto error;
+
ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
if (ret)
goto error;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 59022b744144..d609f4659afb 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1298,29 +1298,6 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
return 0;
}
-static void devx_free_indirect_mkey(struct rcu_head *rcu)
-{
- kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
-}
-
-/* This function to delete from the radix tree needs to be called before
- * destroying the underlying mkey. Otherwise a race might occur in case that
- * other thread will get the same mkey before this one will be deleted,
- * in that case it will fail via inserting to the tree its own data.
- *
- * Note:
- * An error in the destroy is not expected unless there is some other indirect
- * mkey which points to this one. In a kernel cleanup flow it will be just
- * destroyed in the iterative destruction call. In a user flow, in case
- * the application didn't close in the expected order it's its own problem,
- * the mkey won't be part of the tree, in both cases the kernel is safe.
- */
-static void devx_cleanup_mkey(struct devx_obj *obj)
-{
- xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
- mlx5_base_mkey(obj->devx_mr.mmkey.key));
-}
-
static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
struct devx_event_subscription *sub)
{
@@ -1362,8 +1339,16 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
int ret;
dev = mlx5_udata_to_mdev(&attrs->driver_udata);
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
- devx_cleanup_mkey(obj);
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ /*
+ * The pagefault_single_data_segment() does commands against
+ * the mmkey, we must wait for that to stop before freeing the
+ * mkey, as another allocation could get the same mkey #.
+ */
+ xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+ mlx5_base_mkey(obj->devx_mr.mmkey.key));
+ synchronize_srcu(&dev->mr_srcu);
+ }
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
@@ -1382,12 +1367,6 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
devx_cleanup_subscription(dev, sub_entry);
mutex_unlock(&devx_event_table->event_xa_lock);
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
- call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
- devx_free_indirect_mkey);
- return ret;
- }
-
kfree(obj);
return ret;
}
@@ -1491,26 +1470,21 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
&obj_id);
WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
- err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
- if (err)
- goto obj_destroy;
- }
-
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
if (err)
- goto err_copy;
+ goto obj_destroy;
if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT)
obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type);
-
obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id);
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+ if (err)
+ goto obj_destroy;
+ }
return 0;
-err_copy:
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
- devx_cleanup_mkey(obj);
obj_destroy:
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 2ceaef3ea3fb..1a98ee2e01c4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -606,7 +606,7 @@ struct mlx5_ib_mr {
struct mlx5_ib_dev *dev;
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
struct mlx5_core_sig_ctx *sig;
- int live;
+ unsigned int live;
void *descs_alloc;
int access_flags; /* Needed for rereg MR */
@@ -639,7 +639,6 @@ struct mlx5_ib_mw {
struct mlx5_ib_devx_mr {
struct mlx5_core_mkey mmkey;
int ndescs;
- struct rcu_head rcu;
};
struct mlx5_ib_umr_context {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 1eff031ef048..630599311586 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -84,32 +84,6 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
}
-static void update_odp_mr(struct mlx5_ib_mr *mr)
-{
- if (is_odp_mr(mr)) {
- /*
- * This barrier prevents the compiler from moving the
- * setting of umem->odp_data->private to point to our
- * MR, before reg_umr finished, to ensure that the MR
- * initialization have finished before starting to
- * handle invalidations.
- */
- smp_wmb();
- to_ib_umem_odp(mr->umem)->private = mr;
- /*
- * Make sure we will see the new
- * umem->odp_data->private value in the invalidation
- * routines, before we can get page faults on the
- * MR. Page faults can happen once we put the MR in
- * the tree, below this line. Without the barrier,
- * there can be a fault handling and an invalidation
- * before umem->odp_data->private == mr is visible to
- * the invalidation handler.
- */
- smp_wmb();
- }
-}
-
static void reg_mr_callback(int status, struct mlx5_async_work *context)
{
struct mlx5_ib_mr *mr =
@@ -1346,8 +1320,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->umem = umem;
set_mr_fields(dev, mr, npages, length, access_flags);
- update_odp_mr(mr);
-
if (use_umr) {
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
@@ -1363,10 +1335,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
}
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- mr->live = 1;
+ if (is_odp_mr(mr)) {
+ to_ib_umem_odp(mr->umem)->private = mr;
atomic_set(&mr->num_pending_prefetch, 0);
}
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ smp_store_release(&mr->live, 1);
return &mr->ibmr;
error:
@@ -1441,6 +1415,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
if (!mr->umem)
return -EINVAL;
+ if (is_odp_mr(mr))
+ return -EOPNOTSUPP;
+
if (flags & IB_MR_REREG_TRANS) {
addr = virt_addr;
len = length;
@@ -1486,8 +1463,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
mr->allocated_from_cache = 0;
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- mr->live = 1;
} else {
/*
* Send a UMR WQE
@@ -1516,7 +1491,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
set_mr_fields(dev, mr, npages, len, access_flags);
- update_odp_mr(mr);
return 0;
err:
@@ -1607,15 +1581,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Prevent new page faults and
* prefetch requests from succeeding
*/
- mr->live = 0;
+ WRITE_ONCE(mr->live, 0);
+
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&dev->mr_srcu);
/* dequeue pending prefetch requests for the mr */
if (atomic_read(&mr->num_pending_prefetch))
flush_workqueue(system_unbound_wq);
WARN_ON(atomic_read(&mr->num_pending_prefetch));
- /* Wait for all running page-fault handlers to finish. */
- synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
if (!umem_odp->is_implicit_odp)
mlx5_ib_invalidate_range(umem_odp,
@@ -1987,14 +1962,25 @@ free:
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
+ struct mlx5_ib_dev *dev = to_mdev(mw->device);
struct mlx5_ib_mw *mmw = to_mmw(mw);
int err;
- err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
- &mmw->mmkey);
- if (!err)
- kfree(mmw);
- return err;
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ xa_erase(&dev->mdev->priv.mkey_table,
+ mlx5_base_mkey(mmw->mmkey.key));
+ /*
+ * pagefault_single_data_segment() may be accessing mmw under
+ * SRCU if the user bound an ODP MR to this MW.
+ */
+ synchronize_srcu(&dev->mr_srcu);
+ }
+
+ err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
+ if (err)
+ return err;
+ kfree(mmw);
+ return 0;
}
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 2e9b43061797..3f9478d19376 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -178,6 +178,29 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
return;
}
+ /*
+ * The locking here is pretty subtle. Ideally the implicit children
+ * list would be protected by the umem_mutex, however that is not
+ * possible. Instead this uses a weaker update-then-lock pattern:
+ *
+ * srcu_read_lock()
+ * <change children list>
+ * mutex_lock(umem_mutex)
+ * mlx5_ib_update_xlt()
+ * mutex_unlock(umem_mutex)
+ * destroy lkey
+ *
+ * ie any change the children list must be followed by the locked
+ * update_xlt before destroying.
+ *
+ * The umem_mutex provides the acquire/release semantic needed to make
+ * the children list visible to a racing thread. While SRCU is not
+ * technically required, using it gives consistent use of the SRCU
+ * locking around the children list.
+ */
+ lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
+ lockdep_assert_held(&mr->dev->mr_srcu);
+
odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
nentries * MLX5_IMR_MTT_SIZE, mr);
@@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+ int srcu_key;
mr->parent = NULL;
synchronize_srcu(&mr->dev->mr_srcu);
- ib_umem_odp_release(odp);
- if (imr->live)
+ if (smp_load_acquire(&imr->live)) {
+ srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+ mutex_lock(&odp_imr->umem_mutex);
mlx5_ib_update_xlt(imr, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ATOMIC);
+ mutex_unlock(&odp_imr->umem_mutex);
+ srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+ }
+ ib_umem_odp_release(odp);
mlx5_mr_cache_free(mr->dev, mr);
if (atomic_dec_and_test(&imr->num_leaf_free))
@@ -278,7 +308,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
idx - blk_start_idx + 1, 0,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
- mutex_unlock(&umem_odp->umem_mutex);
/*
* We are now sure that the device will not access the
* memory. We can safely unmap it, and mark it as dirty if
@@ -289,10 +318,12 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
if (unlikely(!umem_odp->npages && mr->parent &&
!umem_odp->dying)) {
- WRITE_ONCE(umem_odp->dying, 1);
+ WRITE_ONCE(mr->live, 0);
+ umem_odp->dying = 1;
atomic_inc(&mr->parent->num_leaf_free);
schedule_work(&umem_odp->work);
}
+ mutex_unlock(&umem_odp->umem_mutex);
}
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -429,8 +460,6 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
- mr->live = 1;
-
mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
mr->mmkey.key, dev->mdev, mr);
@@ -484,6 +513,8 @@ next_mr:
mtt->parent = mr;
INIT_WORK(&odp->work, mr_leaf_free_action);
+ smp_store_release(&mtt->live, 1);
+
if (!nentries)
start_idx = addr >> MLX5_IMR_MTT_SHIFT;
nentries++;
@@ -536,6 +567,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
+ smp_store_release(&imr->live, 1);
return imr;
}
@@ -555,15 +587,19 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
if (mr->parent != imr)
continue;
+ mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
- if (umem_odp->dying)
+ if (umem_odp->dying) {
+ mutex_unlock(&umem_odp->umem_mutex);
continue;
+ }
- WRITE_ONCE(umem_odp->dying, 1);
+ umem_odp->dying = 1;
atomic_inc(&imr->num_leaf_free);
schedule_work(&umem_odp->work);
+ mutex_unlock(&umem_odp->umem_mutex);
}
up_read(&per_mm->umem_rwsem);
@@ -773,7 +809,7 @@ next_mr:
switch (mmkey->type) {
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- if (!mr->live || !mr->ibmr.pd) {
+ if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
mlx5_ib_dbg(dev, "got dead MR\n");
ret = -EFAULT;
goto srcu_unlock;
@@ -1641,12 +1677,12 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd,
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- if (mr->ibmr.pd != pd) {
+ if (!smp_load_acquire(&mr->live)) {
ret = false;
break;
}
- if (!mr->live) {
+ if (mr->ibmr.pd != pd) {
ret = false;
break;
}
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index 6cac0c88cf39..36cdfbdbd325 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -230,8 +230,6 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
pvrdma_page_dir_cleanup(dev, &srq->pdir);
- kfree(srq);
-
atomic_dec(&dev->num_srqs);
}
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
index 430314c8abd9..52d402f39df9 100644
--- a/drivers/infiniband/sw/siw/siw_qp.c
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -182,12 +182,19 @@ void siw_qp_llp_close(struct siw_qp *qp)
*/
void siw_qp_llp_write_space(struct sock *sk)
{
- struct siw_cep *cep = sk_to_cep(sk);
+ struct siw_cep *cep;
- cep->sk_write_space(sk);
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (cep) {
+ cep->sk_write_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- (void)siw_sq_start(cep->qp);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ (void)siw_sq_start(cep->qp);
+ }
+
+ read_unlock(&sk->sk_callback_lock);
}
static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index 9231b39d18b2..c501bf2a0252 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -112,17 +112,11 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0};
u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0};
struct xarray *mkeys = &dev->priv.mkey_table;
- struct mlx5_core_mkey *deleted_mkey;
unsigned long flags;
xa_lock_irqsave(mkeys, flags);
- deleted_mkey = __xa_erase(mkeys, mlx5_base_mkey(mkey->key));
+ __xa_erase(mkeys, mlx5_base_mkey(mkey->key));
xa_unlock_irqrestore(mkeys, flags);
- if (!deleted_mkey) {
- mlx5_core_dbg(dev, "failed xarray delete of mkey 0x%x\n",
- mlx5_base_mkey(mkey->key));
- return -ENOENT;
- }
MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8fe4eb7e5045..27e5b269e729 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
- struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
@@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
@@ -1758,9 +1758,20 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
true);
if (ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0546401bc0a..0f2754eaa05b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6305,13 +6305,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
+ unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
+ nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
+ memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e87cbdad02a3..b57f3618e58e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret;
+ int i = 0, tree_block_level = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f3215028235c..123ac54af071 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- u64 clone_src_i_size;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 29b82a795522..8a6cc600bf18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2932,7 +2932,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdd7af424033..bdfe4493e43a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return is_power_of_2(flags);
+ /*
+ * Don't use is_power_of_2(unsigned long) because it won't work
+ * for the single profile (1ULL << 48) on 32-bit CPUs.
+ */
+ return flags != 0 && (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8aaa7eec7b74..e88421d9a48d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb,
if (work->auto_free)
kfree(work);
- if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(done->waitq);
+ if (done) {
+ wait_queue_head_t *waitq = done->waitq;
+
+ /* @done can't be accessed after the following dec */
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(waitq);
+ }
}
static void wb_queue_work(struct bdi_writeback *wb,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8a0381f1a43b..92972b5c1d00 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -591,14 +591,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
io_cqring_ev_posted(ctx);
}
-static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
-{
- percpu_ref_put_many(&ctx->refs, refs);
-
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
-}
-
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
@@ -646,7 +638,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->result = 0;
return req;
out:
- io_ring_drop_ctx_refs(ctx, 1);
+ percpu_ref_put(&ctx->refs);
return NULL;
}
@@ -654,7 +646,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
{
if (*nr) {
kmem_cache_free_bulk(req_cachep, *nr, reqs);
- io_ring_drop_ctx_refs(ctx, *nr);
+ percpu_ref_put_many(&ctx->refs, *nr);
*nr = 0;
}
}
@@ -663,7 +655,7 @@ static void __io_free_req(struct io_kiocb *req)
{
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
- io_ring_drop_ctx_refs(req->ctx, 1);
+ percpu_ref_put(&req->ctx->refs);
kmem_cache_free(req_cachep, req);
}
@@ -2761,7 +2753,7 @@ out:
if (link)
io_queue_link_head(ctx, link, &link->submit, shadow_req,
- block_for_last);
+ !block_for_last);
if (statep)
io_submit_state_end(statep);
@@ -2920,8 +2912,12 @@ static void io_finish_async(struct io_ring_ctx *ctx)
static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
+ if (ctx->sqo_wq[i])
+ flush_workqueue(ctx->sqo_wq[i]);
- io_finish_async(ctx);
unix_destruct_scm(skb);
}
@@ -3630,7 +3626,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
- io_ring_drop_ctx_refs(ctx, 1);
+ percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
return submitted ? submitted : ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index c9b2850c0f7c..540611b99b9a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file)
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
- struct list_head *from,
- int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+ struct list_head *p,
+ loff_t count,
+ struct dentry *last)
{
- unsigned *seq = &parent->d_inode->i_dir_seq, n;
- struct dentry *res;
- struct list_head *p;
- bool skipped;
- int i;
+ struct dentry *dentry = cursor->d_parent, *found = NULL;
-retry:
- i = count;
- skipped = false;
- n = smp_load_acquire(seq) & ~1;
- res = NULL;
- rcu_read_lock();
- for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ spin_lock(&dentry->d_lock);
+ while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
- if (!simple_positive(d)) {
- skipped = true;
- } else if (!--i) {
- res = d;
- break;
+ // we must at least skip cursors, to avoid livelocks
+ if (d->d_flags & DCACHE_DENTRY_CURSOR)
+ continue;
+ if (simple_positive(d) && !--count) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ found = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(found))
+ break;
+ count = 1;
+ }
+ if (need_resched()) {
+ list_move(&cursor->d_child, p);
+ p = &cursor->d_child;
+ spin_unlock(&dentry->d_lock);
+ cond_resched();
+ spin_lock(&dentry->d_lock);
}
}
- rcu_read_unlock();
- if (skipped) {
- smp_rmb();
- if (unlikely(*seq != n))
- goto retry;
- }
- return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
- struct dentry *parent = cursor->d_parent;
- unsigned n, *seq = &parent->d_inode->i_dir_seq;
- spin_lock(&parent->d_lock);
- for (;;) {
- n = *seq;
- if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
- break;
- cpu_relax();
- }
- __list_del(cursor->d_child.prev, cursor->d_child.next);
- if (after)
- list_add(&cursor->d_child, after);
- else
- list_add_tail(&cursor->d_child, &parent->d_subdirs);
- smp_store_release(seq, n + 2);
- spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ dput(last);
+ return found;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
if (offset != file->f_pos) {
+ struct dentry *cursor = file->private_data;
+ struct dentry *to = NULL;
+
+ inode_lock_shared(dentry->d_inode);
+
+ if (offset > 2)
+ to = scan_positives(cursor, &dentry->d_subdirs,
+ offset - 2, NULL);
+ spin_lock(&dentry->d_lock);
+ if (to)
+ list_move(&cursor->d_child, &to->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(to);
+
file->f_pos = offset;
- if (file->f_pos >= 2) {
- struct dentry *cursor = file->private_data;
- struct dentry *to;
- loff_t n = file->f_pos - 2;
-
- inode_lock_shared(dentry->d_inode);
- to = next_positive(dentry, &dentry->d_subdirs, n);
- move_cursor(cursor, to ? &to->d_child : NULL);
- inode_unlock_shared(dentry->d_inode);
- }
+
+ inode_unlock_shared(dentry->d_inode);
}
return offset;
}
@@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p = &cursor->d_child;
- struct dentry *next;
- bool moved = false;
+ struct list_head *anchor = &dentry->d_subdirs;
+ struct dentry *next = NULL;
+ struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = &dentry->d_subdirs;
- while ((next = next_positive(dentry, p, 1)) != NULL) {
+ p = anchor;
+ else if (!list_empty(&cursor->d_child))
+ p = &cursor->d_child;
+ else
+ return 0;
+
+ while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
- moved = true;
- p = &next->d_child;
ctx->pos++;
+ p = &next->d_child;
}
- if (moved)
- move_cursor(cursor, p);
+ spin_lock(&dentry->d_lock);
+ if (next)
+ list_move_tail(&cursor->d_child, &next->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(next);
+
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..9cd0a6815933 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2049,7 +2049,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..d8507972ee13 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
return loc->xl_ops->xlo_check_space(loc, xi);
}
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
- loc->xl_ops->xlo_add_entry(loc, name_hash);
- loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
- /*
- * We can't leave the new entry's xe_name_offset at zero or
- * add_namevalue() will go nuts. We set it to the size of our
- * storage so that it can never be less than any other entry.
- */
- loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
struct ocfs2_xattr_info *xi)
{
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
if (rc)
goto out;
- if (loc->xl_entry) {
- if (ocfs2_xa_can_reuse_entry(loc, xi)) {
- orig_value_size = loc->xl_entry->xe_value_size;
- rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
- if (rc)
- goto out;
- goto alloc_value;
- }
+ if (!loc->xl_entry) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!ocfs2_xattr_is_local(loc->xl_entry)) {
- orig_clusters = ocfs2_xa_value_clusters(loc);
- rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
- if (rc) {
- mlog_errno(rc);
- ocfs2_xa_cleanup_value_truncate(loc,
- "overwriting",
- orig_clusters);
- goto out;
- }
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
}
- ocfs2_xa_wipe_namevalue(loc);
- } else
- ocfs2_xa_add_entry(loc, name_hash);
+ }
+ ocfs2_xa_wipe_namevalue(loc);
/*
* If we get here, we have a blank entry. Fill it. We grow our
diff --git a/fs/readdir.c b/fs/readdir.c
index 19bea591c3f1..6e2623e57b2e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -27,53 +27,13 @@
/*
* Note the "unsafe_put_user() semantics: we goto a
* label for errors.
- *
- * Also note how we use a "while()" loop here, even though
- * only the biggest size needs to loop. The compiler (well,
- * at least gcc) is smart enough to turn the smaller sizes
- * into just if-statements, and this way we don't need to
- * care whether 'u64' or 'u32' is the biggest size.
- */
-#define unsafe_copy_loop(dst, src, len, type, label) \
- while (len >= sizeof(type)) { \
- unsafe_put_user(get_unaligned((type *)src), \
- (type __user *)dst, label); \
- dst += sizeof(type); \
- src += sizeof(type); \
- len -= sizeof(type); \
- }
-
-/*
- * We avoid doing 64-bit copies on 32-bit architectures. They
- * might be better, but the component names are mostly small,
- * and the 64-bit cases can end up being much more complex and
- * put much more register pressure on the code, so it's likely
- * not worth the pain of unaligned accesses etc.
- *
- * So limit the copies to "unsigned long" size. I did verify
- * that at least the x86-32 case is ok without this limiting,
- * but I worry about random other legacy 32-bit cases that
- * might not do as well.
- */
-#define unsafe_copy_type(dst, src, len, type, label) do { \
- if (sizeof(type) <= sizeof(unsigned long)) \
- unsafe_copy_loop(dst, src, len, type, label); \
-} while (0)
-
-/*
- * Copy the dirent name to user space, and NUL-terminate
- * it. This should not be a function call, since we're doing
- * the copy inside a "user_access_begin/end()" section.
*/
#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
char __user *dst = (_dst); \
const char *src = (_src); \
size_t len = (_len); \
- unsafe_copy_type(dst, src, len, u64, label); \
- unsafe_copy_type(dst, src, len, u32, label); \
- unsafe_copy_type(dst, src, len, u16, label); \
- unsafe_copy_type(dst, src, len, u8, label); \
- unsafe_put_user(0, dst, label); \
+ unsafe_put_user(0, dst+len, label); \
+ unsafe_copy_to_user(dst, src, len, label); \
} while (0)
diff --git a/fs/super.c b/fs/super.c
index f627b7c53d2b..cfadab2cbf35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc,
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
return -EBUSY;
}
@@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc,
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s))
+ if (IS_ERR(s)) {
+ blkdev_put(bdev, mode);
return PTR_ERR(s);
+ }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5de296b34ab1..14fbdf22b7e7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -28,12 +28,11 @@ xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
- int flags,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
if (!bp)
return NULL;
@@ -345,7 +344,7 @@ xfs_ag_init_hdr(
{
struct xfs_buf *bp;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
if (!bp)
return -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b9f019603d0b..f0089e862216 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf(
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
- if (error) {
- /*
- * If we hit an IO error middle of the transaction inside
- * grow_inode(), we may have inconsistent data. Bail out.
- */
- if (error == -EIO)
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
- if (error) {
- /* xfs_attr3_leaf_create may not have instantiated a block */
- if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4edc25a2ba80..02469d59c787 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -792,6 +792,7 @@ out_root_realloc:
*/
void
xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork)
{
@@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty(
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -840,7 +842,7 @@ xfs_bmap_local_to_extents(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags = XFS_ILOG_CORE;
goto done;
}
@@ -887,7 +889,7 @@ xfs_bmap_local_to_extents(
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
ifp->if_u1.if_root = NULL;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 5bb446d80542..e2798c6f3a5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
-void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9595ced393dc..49e4bc39e7bb 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block(
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
/*
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 93b3793bc5b3..0cab11a5d390 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -341,7 +341,6 @@ xchk_refcountbt_rec(
xfs_extlen_t len;
xfs_nlink_t refcount;
bool has_cowflag;
- int error = 0;
bno = be32_to_cpu(rec->refc.rc_startblock);
len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -366,7 +365,7 @@ xchk_refcountbt_rec(
xchk_refcountbt_xref(bs->sc, bno, len, refcount);
- return error;
+ return 0;
}
/* Make sure we have as many refc blocks as the rmap says. */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0910cb75b65d..4f443703065e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -864,6 +864,7 @@ xfs_alloc_file_space(
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
int nimaps;
int quota_flag;
int rt;
@@ -891,7 +892,8 @@ xfs_alloc_file_space(
imapp = &imaps[0];
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
* Allocate file space until done or until there is an error
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 21c243622a79..0abba171aa89 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -345,6 +345,15 @@ xfs_buf_allocate_memory(
unsigned short page_count, i;
xfs_off_t start, end;
int error;
+ xfs_km_flags_t kmflag_mask = 0;
+
+ /*
+ * assure zeroed buffer for non-read cases.
+ */
+ if (!(flags & XBF_READ)) {
+ kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
+ }
/*
* for buffers that are contained within a single page, just allocate
@@ -354,7 +363,8 @@ xfs_buf_allocate_memory(
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
+ bp->b_addr = kmem_alloc_io(size, align_mask,
+ KM_NOFS | kmflag_mask);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2beee9f74da..641d07f30a27 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1443,7 +1443,7 @@ xlog_alloc_log(
prev_iclog = iclog;
iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL);
+ KM_MAYFAIL | KM_ZERO);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 508319039dce..c1a514ffff55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -127,7 +127,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
}
/*
diff --git a/include/linux/leds.h b/include/linux/leds.h
index b8df71193329..efb309dba914 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev,
/**
* led_set_brightness_sync - set LED brightness synchronously
* @led_cdev: the LED to set
- * @brightness: the brightness to set it to
+ * @value: the brightness to set it to
*
* Set an LED's brightness immediately. This function will block
* the caller for the time required for accessing device registers,
@@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev);
/**
* led_compose_name - compose LED class device name
* @dev: LED controller device object
- * @child: child fwnode_handle describing a LED or a group of synchronized LEDs;
- * it must be provided only for fwnode based LEDs
+ * @init_data: the LED class device initialization data
* @led_classdev_name: composed LED class device name
*
* Create LED class device name basing on the provided init_data argument.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9b60863429cc..ae703ea3ef48 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -356,6 +356,19 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ if (in_low_reclaim)
+ return READ_ONCE(memcg->memory.emin);
+
+ return max(READ_ONCE(memcg->memory.emin),
+ READ_ONCE(memcg->memory.elow));
+}
+
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
@@ -537,6 +550,8 @@ void mem_cgroup_handle_over_high(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -829,6 +844,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ return 0;
+}
+
static inline enum mem_cgroup_protection mem_cgroup_protected(
struct mem_cgroup *root, struct mem_cgroup *memcg)
{
@@ -968,6 +989,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return 0;
}
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
@@ -1264,6 +1290,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
struct bdi_writeback *wb)
{
+ if (mem_cgroup_disabled())
+ return;
+
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ab2b98ad76e1..4d2a2fa55ed5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
*
+ * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
+ * bytes. For @size of power of two bytes, the alignment is also guaranteed
+ * to be at least to the size.
+ *
* The @flags argument may be one of the GFP flags defined at
* include/linux/gfp.h and described at
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index e47d0522a1f4..d4ee6e942562 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
-#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
-#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
+#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
+#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
+#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
+#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 1f6c45f6a734..bcdf53125210 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
struct ctl_table t;
int ret;
int threads = max_threads;
- int min = MIN_THREADS;
+ int min = 1;
int max = MAX_THREADS;
t = *table;
@@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- set_max_threads(threads);
+ max_threads = threads;
return 0;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c0738424bb43..dc520f01f99d 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt);
bool pm_freezing;
bool pm_nosig_freezing;
-/*
- * Temporary export for the deadlock workaround in ata_scsi_hotplug().
- * Remove once the hack becomes unnecessary.
- */
-EXPORT_SYMBOL_GPL(pm_freezing);
-
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
diff --git a/kernel/panic.c b/kernel/panic.c
index 47e8ebccc22b..f470a038b05b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,6 +180,7 @@ void panic(const char *fmt, ...)
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index cc00364bd2c2..9fe698ff62ec 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -24,13 +24,4 @@ config GENERIC_COMPAT_VDSO
help
This config option enables the compat VDSO layer.
-config CROSS_COMPILE_COMPAT_VDSO
- string "32 bit Toolchain prefix for compat vDSO"
- default ""
- depends on GENERIC_COMPAT_VDSO
- help
- Defines the cross-compiler prefix for compiling compat vDSO.
- If a 64 bit compiler (i.e. x86_64) can compile the VDSO for
- 32 bit, it does not need to define this parameter.
-
endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d9daa3e422d0..c360f6a6c844 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -239,8 +239,8 @@ static int __init default_bdi_init(void)
{
int err;
- bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
+ WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c313c49074ca..bdac56009a38 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
diff --git a/mm/memremap.c b/mm/memremap.c
index 32c79b51af86..68204912cc0a 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -13,8 +13,6 @@
#include <linux/xarray.h>
static DEFINE_XARRAY(pgmap_array);
-#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
-#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15c2050c629b..c0b2e0306720 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- arch_free_page(page, order);
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order, 0);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
diff --git a/mm/shmem.c b/mm/shmem.c
index cd570cc79c76..220be9fa2c41 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3482,6 +3482,12 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
{
char *options = data;
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
+
while (options != NULL) {
char *this_char = options;
for (;;) {
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 3ce12481b1dc..b3fe97fd6654 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -33,7 +33,7 @@ __meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
}
static bool shuffle_param;
-extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
? 'Y' : 'N');
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6491c3a41805..c29f03adca91 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int useroffset, unsigned int usersize)
{
int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
s->name = name;
s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
s->useroffset = useroffset;
s->usersize = usersize;
@@ -1287,12 +1296,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
- void *ret;
+ void *ret = NULL;
struct page *page;
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
- ret = page ? page_address(page) : NULL;
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ret, size, 1, flags);
diff --git a/mm/slob.c b/mm/slob.c
index cf377beab962..fa53e9f73893 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- void *page;
+ struct page *page;
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
return page_address(page);
}
static void slob_free_pages(void *b, int order)
{
+ struct page *sp = virt_to_page(b);
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- free_pages((unsigned long)b, order);
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
}
/*
@@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order)
* @sp: Page to look in.
* @size: Size of the allocation.
* @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
* @page_removed_from_list: Return parameter.
*
* Tries to find a chunk of memory at least @size bytes big within @page.
@@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order)
* true (set to false otherwise).
*/
static void *slob_page_alloc(struct page *sp, size_t size, int align,
- bool *page_removed_from_list)
+ int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
@@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
if (align) {
- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
delta = aligned - cur;
}
if (avail >= units + delta) { /* room enough? */
@@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
/*
* slob_alloc: entry point into the slob allocator.
*/
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
{
struct page *sp;
struct list_head *slob_list;
@@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- b = slob_page_alloc(sp, size, align, &page_removed_from_list);
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
if (!b)
continue;
@@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align, &_unused);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
@@ -451,7 +469,7 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
fs_reclaim_acquire(gfp);
fs_reclaim_release(gfp);
- if (size < PAGE_SIZE - align) {
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
if (!size)
return ZERO_SIZE_PTR;
- m = slob_alloc(size + align, gfp, align, node);
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
if (!m)
return NULL;
*m = size;
- ret = (void *)m + align;
+ ret = (void *)m + minalign;
trace_kmalloc_node(caller, ret,
- size, size + align, gfp, node);
+ size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -521,8 +548,13 @@ void kfree(const void *block)
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
- } else
- __free_pages(sp, compound_order(sp));
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
+
+ }
}
EXPORT_SYMBOL(kfree);
@@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node);
+ b = slob_alloc(c->size, flags, c->align, node, 0);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
diff --git a/mm/slub.c b/mm/slub.c
index 42c1b3af3c98..3d63ae320d31 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3821,11 +3821,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
return kmalloc_large_node_hook(ptr, size, flags);
}
@@ -3951,9 +3955,13 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
diff --git a/mm/sparse.c b/mm/sparse.c
index bf32de9e666b..f6891c1992b1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -219,7 +219,7 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
-void subsection_mask_set(unsigned long *map, unsigned long pfn,
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
unsigned long nr_pages)
{
int idx = subsection_map_index(pfn);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index f3b50811497a..4bac22fe1aa2 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* "hierarchy" or "local").
*
* To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
*/
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
@@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
- enum vmpressure_levels level = -1;
+ enum vmpressure_levels level;
char *spec, *spec_orig;
char *token;
int ret = 0;
@@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
/* Find required level */
token = strsep(&spec, ",");
- level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
- if (level < 0) {
- ret = level;
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
goto out;
- }
+ level = ret;
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
- if (mode < 0) {
- ret = mode;
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
goto out;
- }
+ mode = ret;
}
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
+ ret = 0;
out:
kfree(spec_orig);
return ret;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5d52d6a24af..c6659bb758a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2459,17 +2459,70 @@ out:
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decremeting
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2489,7 +2542,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2498,7 +2551,7 @@ out:
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2742,6 +2795,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 05bdf90646e7..6d3d3f698ebb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page has not been claimed by us */
z3fold_page_unlock(zhdr);
return;
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 3a29e7c24ba9..a5813c7629c1 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -1946,7 +1946,14 @@ static int convert_context(struct context *oldc, struct context *newc, void *p)
rc = string_to_context_struct(args->newp, NULL, s,
newc, SECSID_NULL);
if (rc == -EINVAL) {
- /* Retain string representation for later mapping. */
+ /*
+ * Retain string representation for later mapping.
+ *
+ * IMPORTANT: We need to copy the contents of oldc->str
+ * back into s again because string_to_context_struct()
+ * may have garbled it.
+ */
+ memcpy(s, oldc->str, oldc->len);
context_init(newc);
newc->str = s;
newc->len = oldc->len;
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index c3feccb99ff5..4cdbae6f4e61 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -63,6 +63,13 @@ TARGETS += zram
TARGETS_HOTPLUG = cpu-hotplug
TARGETS_HOTPLUG += memory-hotplug
+# User can optionally provide a TARGETS skiplist.
+SKIP_TARGETS ?=
+ifneq ($(SKIP_TARGETS),)
+ TMP := $(filter-out $(SKIP_TARGETS), $(TARGETS))
+ override TARGETS := $(TMP)
+endif
+
# Clear LDFLAGS and MAKEFLAGS if called from main
# Makefile to avoid test build failures when test
# Makefile doesn't have explicit build rules.
@@ -171,9 +178,12 @@ run_pstore_crash:
# 1. output_dir=kernel_src
# 2. a separate output directory is specified using O= KBUILD_OUTPUT
# 3. a separate output directory is specified using KBUILD_OUTPUT
+# Avoid conflict with INSTALL_PATH set by the main Makefile
#
-INSTALL_PATH ?= $(BUILD)/install
-INSTALL_PATH := $(abspath $(INSTALL_PATH))
+KSFT_INSTALL_PATH ?= $(BUILD)/kselftest_install
+KSFT_INSTALL_PATH := $(abspath $(KSFT_INSTALL_PATH))
+# Avoid changing the rest of the logic here and lib.mk.
+INSTALL_PATH := $(KSFT_INSTALL_PATH)
ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh
install: all
@@ -198,11 +208,16 @@ ifdef INSTALL_PATH
echo " cat /dev/null > \$$logfile" >> $(ALL_SCRIPT)
echo "fi" >> $(ALL_SCRIPT)
+ @# While building run_kselftest.sh skip also non-existent TARGET dirs:
+ @# they could be the result of a build failure and should NOT be
+ @# included in the generated runlist.
for TARGET in $(TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET; \
+ [ ! -d $$INSTALL_PATH/$$TARGET ] && echo "Skipping non-existent dir: $$TARGET" && continue; \
echo "[ -w /dev/kmsg ] && echo \"kselftest: Running tests in $$TARGET\" >> /dev/kmsg" >> $(ALL_SCRIPT); \
echo "cd $$TARGET" >> $(ALL_SCRIPT); \
echo -n "run_many" >> $(ALL_SCRIPT); \
+ echo -n "Emit Tests for $$TARGET\n"; \
$(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \
echo "" >> $(ALL_SCRIPT); \
echo "cd \$$ROOT" >> $(ALL_SCRIPT); \
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 00c9020bdda8..84de7bc74f2c 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -3,9 +3,14 @@
#
# Runs a set of tests in a given subdirectory.
export skip_rc=4
+export timeout_rc=124
export logfile=/dev/stdout
export per_test_logging=
+# Defaults for "settings" file fields:
+# "timeout" how many seconds to let each test run before failing.
+export kselftest_default_timeout=45
+
# There isn't a shell-agnostic way to find the path of a sourced file,
# so we must rely on BASE_DIR being set to find other tools.
if [ -z "$BASE_DIR" ]; then
@@ -24,6 +29,16 @@ tap_prefix()
fi
}
+tap_timeout()
+{
+ # Make sure tests will time out if utility is available.
+ if [ -x /usr/bin/timeout ] ; then
+ /usr/bin/timeout "$kselftest_timeout" "$1"
+ else
+ "$1"
+ fi
+}
+
run_one()
{
DIR="$1"
@@ -32,6 +47,18 @@ run_one()
BASENAME_TEST=$(basename $TEST)
+ # Reset any "settings"-file variables.
+ export kselftest_timeout="$kselftest_default_timeout"
+ # Load per-test-directory kselftest "settings" file.
+ settings="$BASE_DIR/$DIR/settings"
+ if [ -r "$settings" ] ; then
+ while read line ; do
+ field=$(echo "$line" | cut -d= -f1)
+ value=$(echo "$line" | cut -d= -f2-)
+ eval "kselftest_$field"="$value"
+ done < "$settings"
+ fi
+
TEST_HDR_MSG="selftests: $DIR: $BASENAME_TEST"
echo "# $TEST_HDR_MSG"
if [ ! -x "$TEST" ]; then
@@ -44,14 +71,17 @@ run_one()
echo "not ok $test_num $TEST_HDR_MSG"
else
cd `dirname $TEST` > /dev/null
- (((((./$BASENAME_TEST 2>&1; echo $? >&3) |
+ ((((( tap_timeout ./$BASENAME_TEST 2>&1; echo $? >&3) |
tap_prefix >&4) 3>&1) |
(read xs; exit $xs)) 4>>"$logfile" &&
echo "ok $test_num $TEST_HDR_MSG") ||
- (if [ $? -eq $skip_rc ]; then \
+ (rc=$?; \
+ if [ $rc -eq $skip_rc ]; then \
echo "not ok $test_num $TEST_HDR_MSG # SKIP"
+ elif [ $rc -eq $timeout_rc ]; then \
+ echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT"
else
- echo "not ok $test_num $TEST_HDR_MSG"
+ echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
fi)
cd - >/dev/null
fi
diff --git a/tools/testing/selftests/kselftest_install.sh b/tools/testing/selftests/kselftest_install.sh
index ec304463883c..e2e1911d62d5 100755
--- a/tools/testing/selftests/kselftest_install.sh
+++ b/tools/testing/selftests/kselftest_install.sh
@@ -24,12 +24,12 @@ main()
echo "$0: Installing in specified location - $install_loc ..."
fi
- install_dir=$install_loc/kselftest
+ install_dir=$install_loc/kselftest_install
# Create install directory
mkdir -p $install_dir
# Build tests
- INSTALL_PATH=$install_dir make install
+ KSFT_INSTALL_PATH=$install_dir make install
}
main "$@"
diff --git a/tools/testing/selftests/rtc/settings b/tools/testing/selftests/rtc/settings
new file mode 100644
index 000000000000..ba4d85f74cd6
--- /dev/null
+++ b/tools/testing/selftests/rtc/settings
@@ -0,0 +1 @@
+timeout=90
diff --git a/tools/testing/selftests/watchdog/watchdog-test.c b/tools/testing/selftests/watchdog/watchdog-test.c
index afff120c7be6..f45e510500c0 100644
--- a/tools/testing/selftests/watchdog/watchdog-test.c
+++ b/tools/testing/selftests/watchdog/watchdog-test.c
@@ -19,7 +19,7 @@
int fd;
const char v = 'V';
-static const char sopts[] = "bdehp:t:Tn:NLf:";
+static const char sopts[] = "bdehp:t:Tn:NLf:i";
static const struct option lopts[] = {
{"bootstatus", no_argument, NULL, 'b'},
{"disable", no_argument, NULL, 'd'},
@@ -32,6 +32,7 @@ static const struct option lopts[] = {
{"getpretimeout", no_argument, NULL, 'N'},
{"gettimeleft", no_argument, NULL, 'L'},
{"file", required_argument, NULL, 'f'},
+ {"info", no_argument, NULL, 'i'},
{NULL, no_argument, NULL, 0x0}
};
@@ -72,6 +73,7 @@ static void usage(char *progname)
printf("Usage: %s [options]\n", progname);
printf(" -f, --file\t\tOpen watchdog device file\n");
printf("\t\t\tDefault is /dev/watchdog\n");
+ printf(" -i, --info\t\tShow watchdog_info\n");
printf(" -b, --bootstatus\tGet last boot status (Watchdog/POR)\n");
printf(" -d, --disable\t\tTurn off the watchdog timer\n");
printf(" -e, --enable\t\tTurn on the watchdog timer\n");
@@ -97,6 +99,7 @@ int main(int argc, char *argv[])
int c;
int oneshot = 0;
char *file = "/dev/watchdog";
+ struct watchdog_info info;
setbuf(stdout, NULL);
@@ -118,6 +121,16 @@ int main(int argc, char *argv[])
exit(-1);
}
+ /*
+ * Validate that `file` is a watchdog device
+ */
+ ret = ioctl(fd, WDIOC_GETSUPPORT, &info);
+ if (ret) {
+ printf("WDIOC_GETSUPPORT error '%s'\n", strerror(errno));
+ close(fd);
+ exit(ret);
+ }
+
optind = 0;
while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
@@ -205,6 +218,18 @@ int main(int argc, char *argv[])
case 'f':
/* Handled above */
break;
+ case 'i':
+ /*
+ * watchdog_info was obtained as part of file open
+ * validation. So we just show it here.
+ */
+ oneshot = 1;
+ printf("watchdog_info:\n");
+ printf(" identity:\t\t%s\n", info.identity);
+ printf(" firmware_version:\t%u\n",
+ info.firmware_version);
+ printf(" options:\t\t%08x\n", info.options);
+ break;
default:
usage(argv[0]);