From 7f07e5f1f778605e98cf2156d4db1ff3a3a1a74a Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 26 Mar 2019 11:48:57 +0200 Subject: net: mii: Fix PAUSE cap advertisement from linkmode_adv_to_lcl_adv_t() helper With a recent link mode advertisement code update this helper providing local pause capability translation used for flow control link mode negotiation got broken. For eth drivers using this helper, the issue is apparent only if either PAUSE or ASYM_PAUSE is being advertised. Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller --- include/linux/mii.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 6fee8b1a4400..5cd824c1c0ca 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -469,7 +469,7 @@ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; - if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; -- cgit v1.2.3-59-g8ed1b From 80a2a9026b24c6bd34b8d58256973e22270bedec Mon Sep 17 00:00:00 2001 From: Yuval Avnery Date: Mon, 11 Mar 2019 06:18:24 +0200 Subject: net/mlx5e: Add a lock on tir list Refresh tirs is looping over a global list of tirs while netdevs are adding and removing tirs from that list. That is why a lock is required. Fixes: 724b2aa15126 ("net/mlx5e: TIRs management refactoring") Signed-off-by: Yuval Avnery Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 7 +++++++ include/linux/mlx5/driver.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 8100786f6fb5..1539cf3de5dc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -45,7 +45,9 @@ int mlx5e_create_tir(struct mlx5_core_dev *mdev, if (err) return err; + mutex_lock(&mdev->mlx5e_res.td.list_lock); list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list); + mutex_unlock(&mdev->mlx5e_res.td.list_lock); return 0; } @@ -53,8 +55,10 @@ int mlx5e_create_tir(struct mlx5_core_dev *mdev, void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir) { + mutex_lock(&mdev->mlx5e_res.td.list_lock); mlx5_core_destroy_tir(mdev, tir->tirn); list_del(&tir->list); + mutex_unlock(&mdev->mlx5e_res.td.list_lock); } static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, @@ -114,6 +118,7 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) } INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list); + mutex_init(&mdev->mlx5e_res.td.list_lock); return 0; @@ -159,6 +164,7 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb) MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); + mutex_lock(&mdev->mlx5e_res.td.list_lock); list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) { tirn = tir->tirn; err = mlx5_core_modify_tir(mdev, tirn, in, inlen); @@ -170,6 +176,7 @@ out: kvfree(in); if (err) netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); + mutex_unlock(&mdev->mlx5e_res.td.list_lock); return err; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 022541dc5dbf..0d0729648844 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -594,6 +594,8 @@ enum mlx5_pagefault_type_flags { }; struct mlx5_td { + /* protects tirs list changes while tirs refresh */ + struct mutex list_lock; struct list_head tirs_list; u32 tdn; }; -- cgit v1.2.3-59-g8ed1b From a0fe2c6479aab5723239b315ef1b552673f434a3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:49 +0100 Subject: linux/kernel.h: Use parentheses around argument in u64_to_user_ptr() Use parentheses around uses of the argument in u64_to_user_ptr() to ensure that the cast doesn't apply to part of the argument. There are existing uses of the macro of the form u64_to_user_ptr(A + B) which expands to (void __user *)(uintptr_t)A + B (the cast applies to the first operand of the addition, the addition is a pointer addition). This happens to still work as intended, the semantic difference doesn't cause a difference in behavior. But I want to use u64_to_user_ptr() with a ternary operator in the argument, like so: u64_to_user_ptr(A ? B : C) This currently doesn't work as intended. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Mukesh Ojha Cc: Andrei Vagin Cc: Andrew Morton Cc: Dan Carpenter Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jani Nikula Cc: Kees Cook Cc: Masahiro Yamada Cc: NeilBrown Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-1-jannh@google.com --- include/linux/kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 34a5036debd3..2d14e21c16c0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,8 +47,8 @@ #define u64_to_user_ptr(x) ( \ { \ - typecheck(u64, x); \ - (void __user *)(uintptr_t)x; \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ } \ ) -- cgit v1.2.3-59-g8ed1b From 631b7abacd02b88f4b0795c08b54ad4fc3e7c7c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 7 Nov 2016 16:26:35 -0500 Subject: ptrace: Remove maxargs from task_current_syscall() task_current_syscall() has a single user that passes in 6 for maxargs, which is the maximum arguments that can be used to get system calls from syscall_get_arguments(). Instead of passing in a number of arguments to grab, just get 6 arguments. The args argument even specifies that it's an array of 6 items. This will also allow changing syscall_get_arguments() to not get a variable number of arguments, but always grab 6. Linus also suggested not passing in a bunch of arguments to task_current_syscall() but to instead pass in a pointer to a structure, and just fill the structure. struct seccomp_data has almost all the parameters that is needed except for the stack pointer (sp). As seccomp_data is part of uapi, and I'm afraid to change it, a new structure was created "syscall_info", which includes seccomp_data and adds the "sp" field. Link: http://lkml.kernel.org/r/20161107213233.466776454@goodmis.org Cc: Andy Lutomirski Cc: Alexey Dobriyan Cc: Oleg Nesterov Cc: Kees Cook Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- fs/proc/base.c | 17 ++++++++------- include/linux/ptrace.h | 11 +++++++--- lib/syscall.c | 57 ++++++++++++++++++++++---------------------------- 3 files changed, 42 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index ddef482f1334..6a803a0b75df 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -616,24 +616,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - long nr; - unsigned long args[6], sp, pc; + struct syscall_info info; + u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; - if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); - else if (nr < 0) - seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else if (info.data.nr < 0) + seq_printf(m, "%d 0x%llx 0x%llx\n", + info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, - "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - nr, + "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", + info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], - sp, pc); + info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index edb9b040c94c..d5084ebd9f03 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -9,6 +9,13 @@ #include /* For BUG_ON. */ #include /* For task_active_pid_ns. */ #include +#include + +/* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ +struct syscall_info { + __u64 sp; + struct seccomp_data data; +}; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); @@ -407,9 +414,7 @@ static inline void user_single_step_report(struct pt_regs *regs) #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif -extern int task_current_syscall(struct task_struct *target, long *callno, - unsigned long args[6], unsigned int maxargs, - unsigned long *sp, unsigned long *pc); +extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif diff --git a/lib/syscall.c b/lib/syscall.c index 1a7077f20eae..e8467e17b9a2 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -5,16 +5,14 @@ #include #include -static int collect_syscall(struct task_struct *target, long *callno, - unsigned long args[6], unsigned int maxargs, - unsigned long *sp, unsigned long *pc) +static int collect_syscall(struct task_struct *target, struct syscall_info *info) { struct pt_regs *regs; if (!try_get_task_stack(target)) { /* Task has no stack, so the task isn't in a syscall. */ - *sp = *pc = 0; - *callno = -1; + memset(info, 0, sizeof(*info)); + info->data.nr = -1; return 0; } @@ -24,12 +22,13 @@ static int collect_syscall(struct task_struct *target, long *callno, return -EAGAIN; } - *sp = user_stack_pointer(regs); - *pc = instruction_pointer(regs); + info->sp = user_stack_pointer(regs); + info->data.instruction_pointer = instruction_pointer(regs); - *callno = syscall_get_nr(target, regs); - if (*callno != -1L && maxargs > 0) - syscall_get_arguments(target, regs, 0, maxargs, args); + info->data.nr = syscall_get_nr(target, regs); + if (info->data.nr != -1L) + syscall_get_arguments(target, regs, 0, 6, + (unsigned long *)&info->data.args[0]); put_task_stack(target); return 0; @@ -38,41 +37,35 @@ static int collect_syscall(struct task_struct *target, long *callno, /** * task_current_syscall - Discover what a blocked task is doing. * @target: thread to examine - * @callno: filled with system call number or -1 - * @args: filled with @maxargs system call arguments - * @maxargs: number of elements in @args to fill - * @sp: filled with user stack pointer - * @pc: filled with user PC + * @info: structure with the following fields: + * .sp - filled with user stack pointer + * .data.nr - filled with system call number or -1 + * .data.args - filled with @maxargs system call arguments + * .data.instruction_pointer - filled with user PC * - * If @target is blocked in a system call, returns zero with *@callno - * set to the the call's number and @args filled in with its arguments. - * Registers not used for system call arguments may not be available and - * it is not kosher to use &struct user_regset calls while the system + * If @target is blocked in a system call, returns zero with @info.data.nr + * set to the the call's number and @info.data.args filled in with its + * arguments. Registers not used for system call arguments may not be available + * and it is not kosher to use &struct user_regset calls while the system * call is still in progress. Note we may get this result if @target * has finished its system call but not yet returned to user mode, such * as when it's stopped for signal handling or syscall exit tracing. * * If @target is blocked in the kernel during a fault or exception, - * returns zero with *@callno set to -1 and does not fill in @args. - * If so, it's now safe to examine @target using &struct user_regset - * get() calls as long as we're sure @target won't return to user mode. + * returns zero with *@info.data.nr set to -1 and does not fill in + * @info.data.args. If so, it's now safe to examine @target using + * &struct user_regset get() calls as long as we're sure @target won't return + * to user mode. * * Returns -%EAGAIN if @target does not remain blocked. - * - * Returns -%EINVAL if @maxargs is too large (maximum is six). */ -int task_current_syscall(struct task_struct *target, long *callno, - unsigned long args[6], unsigned int maxargs, - unsigned long *sp, unsigned long *pc) +int task_current_syscall(struct task_struct *target, struct syscall_info *info) { long state; unsigned long ncsw; - if (unlikely(maxargs > 6)) - return -EINVAL; - if (target == current) - return collect_syscall(target, callno, args, maxargs, sp, pc); + return collect_syscall(target, info); state = target->state; if (unlikely(!state)) @@ -80,7 +73,7 @@ int task_current_syscall(struct task_struct *target, long *callno, ncsw = wait_task_inactive(target, state); if (unlikely(!ncsw) || - unlikely(collect_syscall(target, callno, args, maxargs, sp, pc)) || + unlikely(collect_syscall(target, info)) || unlikely(wait_task_inactive(target, state) != ncsw)) return -EAGAIN; -- cgit v1.2.3-59-g8ed1b From 5f074f3e192f10c9fade898b9b3b8812e3d83342 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Apr 2019 18:38:45 -0700 Subject: lib/string.c: implement a basic bcmp A recent optimization in Clang (r355672) lowers comparisons of the return value of memcmp against zero to comparisons of the return value of bcmp against zero. This helps some platforms that implement bcmp more efficiently than memcmp. glibc simply aliases bcmp to memcmp, but an optimized implementation is in the works. This results in linkage failures for all targets with Clang due to the undefined symbol. For now, just implement bcmp as a tailcail to memcmp to unbreak the build. This routine can be further optimized in the future. Other ideas discussed: * A weak alias was discussed, but breaks for architectures that define their own implementations of memcmp since aliases to declarations are not permitted (only definitions). Arch-specific memcmp implementations typically declare memcmp in C headers, but implement them in assembly. * -ffreestanding also is used sporadically throughout the kernel. * -fno-builtin-bcmp doesn't work when doing LTO. Link: https://bugs.llvm.org/show_bug.cgi?id=41035 Link: https://code.woboq.org/userspace/glibc/string/memcmp.c.html#bcmp Link: https://github.com/llvm/llvm-project/commit/8e16d73346f8091461319a7dfc4ddd18eedcff13 Link: https://github.com/ClangBuiltLinux/linux/issues/416 Link: http://lkml.kernel.org/r/20190313211335.165605-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Reported-by: Nathan Chancellor Reported-by: Adhemerval Zanella Suggested-by: Arnd Bergmann Suggested-by: James Y Knight Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Suggested-by: Rasmus Villemoes Acked-by: Steven Rostedt (VMware) Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Masahiro Yamada Reviewed-by: Andy Shevchenko Cc: David Laight Cc: Rasmus Villemoes Cc: Namhyung Kim Cc: Greg Kroah-Hartman Cc: Alexander Shishkin Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 3 +++ lib/string.c | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 7927b875f80c..6ab0a6fa512e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -150,6 +150,9 @@ extern void * memscan(void *,int,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_BCMP +extern int bcmp(const void *,const void *,__kernel_size_t); +#endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif diff --git a/lib/string.c b/lib/string.c index 38e4ca08e757..3ab861c1a857 100644 --- a/lib/string.c +++ b/lib/string.c @@ -866,6 +866,26 @@ __visible int memcmp(const void *cs, const void *ct, size_t count) EXPORT_SYMBOL(memcmp); #endif +#ifndef __HAVE_ARCH_BCMP +/** + * bcmp - returns 0 if and only if the buffers have identical contents. + * @a: pointer to first buffer. + * @b: pointer to second buffer. + * @len: size of buffers. + * + * The sign or magnitude of a non-zero return value has no particular + * meaning, and architectures may implement their own more efficient bcmp(). So + * while this particular implementation is a simple (tail) call to memcmp, do + * not rely on anything but whether the return value is zero or non-zero. + */ +#undef bcmp +int bcmp(const void *a, const void *b, size_t len) +{ + return memcmp(a, b, len); +} +EXPORT_SYMBOL(bcmp); +#endif + #ifndef __HAVE_ARCH_MEMSCAN /** * memscan - Find a character in an area of memory. -- cgit v1.2.3-59-g8ed1b From 6147e136ff5071609b54f18982dea87706288e21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2019 18:38:53 -0700 Subject: include/linux/bitrev.h: fix constant bitrev clang points out with hundreds of warnings that the bitrev macros have a problem with constant input: drivers/hwmon/sht15.c:187:11: error: variable '__x' is uninitialized when used within its own initialization [-Werror,-Wuninitialized] u8 crc = bitrev8(data->val_status & 0x0F); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/bitrev.h:102:21: note: expanded from macro 'bitrev8' __constant_bitrev8(__x) : \ ~~~~~~~~~~~~~~~~~~~^~~~ include/linux/bitrev.h:67:11: note: expanded from macro '__constant_bitrev8' u8 __x = x; \ ~~~ ^ Both the bitrev and the __constant_bitrev macros use an internal variable named __x, which goes horribly wrong when passing one to the other. The obvious fix is to rename one of the variables, so this adds an extra '_'. It seems we got away with this because - there are only a few drivers using bitrev macros - usually there are no constant arguments to those - when they are constant, they tend to be either 0 or (unsigned)-1 (drivers/isdn/i4l/isdnhdlc.o, drivers/iio/amplifiers/ad8366.c) and give the correct result by pure chance. In fact, the only driver that I could find that gets different results with this is drivers/net/wan/slic_ds26522.c, which in turn is a driver for fairly rare hardware (adding the maintainer to Cc for testing). Link: http://lkml.kernel.org/r/20190322140503.123580-1-arnd@arndb.de Fixes: 556d2f055bf6 ("ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction") Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Cc: Zhao Qiang Cc: Yalin Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitrev.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 50fb0dee23e8..d35b8ec1c485 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -34,41 +34,41 @@ static inline u32 __bitrev32(u32 x) #define __constant_bitrev32(x) \ ({ \ - u32 __x = x; \ - __x = (__x >> 16) | (__x << 16); \ - __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = (___x >> 16) | (___x << 16); \ + ___x = ((___x & (u32)0xFF00FF00UL) >> 8) | ((___x & (u32)0x00FF00FFUL) << 8); \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev16(x) \ ({ \ - u16 __x = x; \ - __x = (__x >> 8) | (__x << 8); \ - __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ - __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ - __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ - __x; \ + u16 ___x = x; \ + ___x = (___x >> 8) | (___x << 8); \ + ___x = ((___x & (u16)0xF0F0U) >> 4) | ((___x & (u16)0x0F0FU) << 4); \ + ___x = ((___x & (u16)0xCCCCU) >> 2) | ((___x & (u16)0x3333U) << 2); \ + ___x = ((___x & (u16)0xAAAAU) >> 1) | ((___x & (u16)0x5555U) << 1); \ + ___x; \ }) #define __constant_bitrev8x4(x) \ ({ \ - u32 __x = x; \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev8(x) \ ({ \ - u8 __x = x; \ - __x = (__x >> 4) | (__x << 4); \ - __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ - __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ - __x; \ + u8 ___x = x; \ + ___x = (___x >> 4) | (___x << 4); \ + ___x = ((___x & (u8)0xCCU) >> 2) | ((___x & (u8)0x33U) << 2); \ + ___x = ((___x & (u8)0xAAU) >> 1) | ((___x & (u8)0x55U) << 1); \ + ___x; \ }) #define bitrev32(x) \ -- cgit v1.2.3-59-g8ed1b From fcae96ff96538f66e7acd5d4e0f2e7516ff8cbd0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Apr 2019 18:39:01 -0700 Subject: mm: fix vm_fault_t cast in VM_FAULT_GET_HINDEX() Symmetrically to VM_FAULT_SET_HINDEX(), we need a force-cast in VM_FAULT_GET_HINDEX() to tell sparse that this is intentional. Sparse complains about the current code when building a kernel with CONFIG_MEMORY_FAILURE: arch/x86/mm/fault.c:1058:53: warning: restricted vm_fault_t degrades to integer Link: http://lkml.kernel.org/r/20190327204117.35215-1-jannh@google.com Fixes: 3d3539018d2c ("mm: create the new vm_fault_t type") Signed-off-by: Jann Horn Reviewed-by: Andrew Morton Cc: Souptick Joarder Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7eade9132f02..4ef4bbe78a1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -671,7 +671,7 @@ enum vm_fault_reason { /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf) +#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ -- cgit v1.2.3-59-g8ed1b From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 5 Apr 2019 18:39:18 -0700 Subject: mm: writeback: use exact memcg dirty counts Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") memcg dirty and writeback counters are managed as: 1) per-memcg per-cpu values in range of [-32..32] 2) per-memcg atomic counter When a per-cpu counter cannot fit in [-32..32] it's flushed to the atomic. Stat readers only check the atomic. Thus readers such as balance_dirty_pages() may see a nontrivial error margin: 32 pages per cpu. Assuming 100 cpus: 4k x86 page_size: 13 MiB error per memcg 64k ppc page_size: 200 MiB error per memcg Considering that dirty+writeback are used together for some decisions the errors double. This inaccuracy can lead to undeserved oom kills. One nasty case is when all per-cpu counters hold positive values offsetting an atomic negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32). balance_dirty_pages() only consults the atomic and does not consider throttling the next n_cpu*32 dirty pages. If the file_lru is in the 13..200 MiB range then there's absolutely no dirty throttling, which burdens vmscan with only dirty+writeback pages thus resorting to oom kill. It could be argued that tiny containers are not supported, but it's more subtle. It's the amount the space available for file lru that matters. If a container has memory.max-200MiB of non reclaimable memory, then it will also suffer such oom kills on a 100 cpu machine. The following test reliably ooms without this patch. This patch avoids oom kills. $ cat test mount -t cgroup2 none /dev/cgroup cd /dev/cgroup echo +io +memory > cgroup.subtree_control mkdir test cd test echo 10M > memory.max (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo) (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100) $ cat memcg-writeback-stress.c /* * Dirty pages from all but one cpu. * Clean pages from the non dirtying cpu. * This is to stress per cpu counter imbalance. * On a 100 cpu machine: * - per memcg per cpu dirty count is 32 pages for each of 99 cpus * - per memcg atomic is -99*32 pages * - thus the complete dirty limit: sum of all counters 0 * - balance_dirty_pages() only sees atomic count -99*32 pages, which * it max()s to 0. * - So a workload can dirty -99*32 pages before balance_dirty_pages() * cares. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include static char *buf; static int bufSize; static void set_affinity(int cpu) { cpu_set_t affinity; CPU_ZERO(&affinity); CPU_SET(cpu, &affinity); if (sched_setaffinity(0, sizeof(affinity), &affinity)) err(1, "sched_setaffinity"); } static void dirty_on(int output_fd, int cpu) { int i, wrote; set_affinity(cpu); for (i = 0; i < 32; i++) { for (wrote = 0; wrote < bufSize; ) { int ret = write(output_fd, buf+wrote, bufSize-wrote); if (ret == -1) err(1, "write"); wrote += ret; } } } int main(int argc, char **argv) { int cpu, flush_cpu = 1, output_fd; const char *output; if (argc != 2) errx(1, "usage: output_file"); output = argv[1]; bufSize = getpagesize(); buf = malloc(getpagesize()); if (buf == NULL) errx(1, "malloc failed"); output_fd = open(output, O_CREAT|O_RDWR); if (output_fd == -1) err(1, "open(%s)", output); for (cpu = 0; cpu < get_nprocs(); cpu++) { if (cpu != flush_cpu) dirty_on(output_fd, cpu); } set_affinity(flush_cpu); if (fsync(output_fd)) err(1, "fsync(%s)", output); if (close(output_fd)) err(1, "close(%s)", output); free(buf); } Make balance_dirty_pages() and wb_over_bg_thresh() work harder to collect exact per memcg counters. This avoids the aforementioned oom kills. This does not affect the overhead of memory.stat, which still reads the single atomic counter. Why not use percpu_counter? memcg already handles cpus going offline, so no need for that overhead from percpu_counter. And the percpu_counter spinlocks are more heavyweight than is required. It probably also makes sense to use exact dirty and writeback counters in memcg oom reports. But that is saved for later. Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: [4.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- mm/memcontrol.c | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f3d880b7ca1..dbb6118370c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -566,7 +566,10 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 532e0e2a4817..81a0d3914ec9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3882,6 +3882,22 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page(). + */ +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->stat[idx]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; + if (x < 0) + x = 0; + return x; +} + /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -3907,10 +3923,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; -- cgit v1.2.3-59-g8ed1b From 10dce8af34226d90fa56746a934f8da5dcdba3df Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 26 Mar 2019 22:20:43 +0000 Subject: fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added locking for file.f_pos access and in particular made concurrent read and write not possible - now both those functions take f_pos lock for the whole run, and so if e.g. a read is blocked waiting for data, write will deadlock waiting for that read to complete. This caused regression for stream-like files where previously read and write could run simultaneously, but after that patch could not do so anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") which fixes such regression for particular case of /proc/xen/xenbus. The patch that added f_pos lock in 2014 did so to guarantee POSIX thread safety for read/write/lseek and added the locking to file descriptors of all regular files. In 2014 that thread-safety problem was not new as it was already discussed earlier in 2006. However even though 2006'th version of Linus's patch was adding f_pos locking "only for files that are marked seekable with FMODE_LSEEK (thus avoiding the stream-like objects like pipes and sockets)", the 2014 version - the one that actually made it into the tree as 9c225f2655e3 - is doing so irregardless of whether a file is seekable or not. See https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/ https://lwn.net/Articles/180387 https://lwn.net/Articles/180396 for historic context. The reason that it did so is, probably, that there are many files that are marked non-seekable, but e.g. their read implementation actually depends on knowing current position to correctly handle the read. Some examples: kernel/power/user.c snapshot_read fs/debugfs/file.c u32_array_read fs/fuse/control.c fuse_conn_waiting_read + ... drivers/hwmon/asus_atk0110.c atk_debugfs_ggrp_read arch/s390/hypfs/inode.c hypfs_read_iter ... Despite that, many nonseekable_open users implement read and write with pure stream semantics - they don't depend on passed ppos at all. And for those cases where read could wait for something inside, it creates a situation similar to xenbus - the write could be never made to go until read is done, and read is waiting for some, potentially external, event, for potentially unbounded time -> deadlock. Besides xenbus, there are 14 such places in the kernel that I've found with semantic patch (see below): drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write() drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write() drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write() drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write() net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write() drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write() drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write() drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write() net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write() drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write() drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write() drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write() drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write() drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write() In addition to the cases above another regression caused by f_pos locking is that now FUSE filesystems that implement open with FOPEN_NONSEEKABLE flag, can no longer implement bidirectional stream-like files - for the same reason as above e.g. read can deadlock write locking on file.f_pos in the kernel. FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse: implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and write routines not depending on current position at all, and with both read and write being potentially blocking operations: See https://github.com/libfuse/osspd https://lwn.net/Articles/308445 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510 Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as "somewhat pipe-like files ..." with read handler not using offset. However that test implements only read without write and cannot exercise the deadlock scenario: https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216 I've actually hit the read vs write deadlock for real while implementing my FUSE filesystem where there is /head/watch file, for which open creates separate bidirectional socket-like stream in between filesystem and its user with both read and write being later performed simultaneously. And there it is semantically not easy to split the stream into two separate read-only and write-only channels: https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169 Let's fix this regression. The plan is: 1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS - doing so would break many in-kernel nonseekable_open users which actually use ppos in read/write handlers. 2. Add stream_open() to kernel to open stream-like non-seekable file descriptors. Read and write on such file descriptors would never use nor change ppos. And with that property on stream-like files read and write will be running without taking f_pos lock - i.e. read and write could be running simultaneously. 3. With semantic patch search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. 4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via steam_open if that bit is present in filesystem open reply. It was tempting to change fs/fuse/ open handler to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. 5. Add stream_open and FOPEN_STREAM handling to stable kernels starting from v3.14+ (the kernel where 9c225f2655 first appeared). This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in their open handler and this way avoid the deadlock on all kernel versions. This should work because fs/fuse/ ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. This patch adds stream_open, converts /proc/xen/xenbus to it and adds semantic patch to automatically locate in-kernel places that are either required to be converted due to read vs write deadlock, or that are just safe to be converted because read and write do not use ppos and there are no other funky methods in file_operations. Regarding semantic patch I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Cc: Michael Kerrisk Cc: Yongzhi Pan Cc: Jonathan Corbet Cc: David Vrabel Cc: Juergen Gross Cc: Miklos Szeredi Cc: Tejun Heo Cc: Kirill Tkhai Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Julia Lawall Cc: Nikolaus Rath Cc: Han-Wen Nienhuys Signed-off-by: Kirill Smelkov Signed-off-by: Linus Torvalds --- drivers/xen/xenbus/xenbus_dev_frontend.c | 4 +- fs/open.c | 18 ++ fs/read_write.c | 5 +- include/linux/fs.h | 4 + scripts/coccinelle/api/stream_open.cocci | 363 +++++++++++++++++++++++++++++++ 5 files changed, 389 insertions(+), 5 deletions(-) create mode 100644 scripts/coccinelle/api/stream_open.cocci (limited to 'include/linux') diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index c3e201025ef0..0782ff3c2273 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -622,9 +622,7 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) if (xen_store_evtchn == 0) return -ENOENT; - nonseekable_open(inode, filp); - - filp->f_mode &= ~FMODE_ATOMIC_POS; /* cdev-style semantics */ + stream_open(inode, filp); u = kzalloc(sizeof(*u), GFP_KERNEL); if (u == NULL) diff --git a/fs/open.c b/fs/open.c index f1c2f855fd43..a00350018a47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1215,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0). Contrary to file descriptors of other regular + * files, .read() and .write() can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/read_write.c b/fs/read_write.c index 177ccc3d405a..61b43ad7608e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ static inline loff_t file_pos_read(struct file *file) { - return file->f_pos; + return file->f_mode & FMODE_STREAM ? 0 : file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { - file->f_pos = pos; + if ((file->f_mode & FMODE_STREAM) == 0) + file->f_pos = pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b42df09b04c..dd28e7679089 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -158,6 +158,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_OPENED ((__force fmode_t)0x80000) #define FMODE_CREATED ((__force fmode_t)0x100000) +/* File is stream-like */ +#define FMODE_STREAM ((__force fmode_t)0x200000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -3074,6 +3077,7 @@ extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); +extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, diff --git a/scripts/coccinelle/api/stream_open.cocci b/scripts/coccinelle/api/stream_open.cocci new file mode 100644 index 000000000000..350145da7669 --- /dev/null +++ b/scripts/coccinelle/api/stream_open.cocci @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +// Author: Kirill Smelkov (kirr@nexedi.com) +// +// Search for stream-like files that are using nonseekable_open and convert +// them to stream_open. A stream-like file is a file that does not use ppos in +// its read and write. Rationale for the conversion is to avoid deadlock in +// between read and write. + +virtual report +virtual patch +virtual explain // explain decisions in the patch (SPFLAGS="-D explain") + +// stream-like reader & writer - ones that do not depend on f_pos. +@ stream_reader @ +identifier readstream, ppos; +identifier f, buf, len; +type loff_t; +@@ + ssize_t readstream(struct file *f, char *buf, size_t len, loff_t *ppos) + { + ... when != ppos + } + +@ stream_writer @ +identifier writestream, ppos; +identifier f, buf, len; +type loff_t; +@@ + ssize_t writestream(struct file *f, const char *buf, size_t len, loff_t *ppos) + { + ... when != ppos + } + + +// a function that blocks +@ blocks @ +identifier block_f; +identifier wait_event =~ "^wait_event_.*"; +@@ + block_f(...) { + ... when exists + wait_event(...) + ... when exists + } + +// stream_reader that can block inside. +// +// XXX wait_* can be called not directly from current function (e.g. func -> f -> g -> wait()) +// XXX currently reader_blocks supports only direct and 1-level indirect cases. +@ reader_blocks_direct @ +identifier stream_reader.readstream; +identifier wait_event =~ "^wait_event_.*"; +@@ + readstream(...) + { + ... when exists + wait_event(...) + ... when exists + } + +@ reader_blocks_1 @ +identifier stream_reader.readstream; +identifier blocks.block_f; +@@ + readstream(...) + { + ... when exists + block_f(...) + ... when exists + } + +@ reader_blocks depends on reader_blocks_direct || reader_blocks_1 @ +identifier stream_reader.readstream; +@@ + readstream(...) { + ... + } + + +// file_operations + whether they have _any_ .read, .write, .llseek ... at all. +// +// XXX add support for file_operations xxx[N] = ... (sound/core/pcm_native.c) +@ fops0 @ +identifier fops; +@@ + struct file_operations fops = { + ... + }; + +@ has_read @ +identifier fops0.fops; +identifier read_f; +@@ + struct file_operations fops = { + .read = read_f, + }; + +@ has_read_iter @ +identifier fops0.fops; +identifier read_iter_f; +@@ + struct file_operations fops = { + .read_iter = read_iter_f, + }; + +@ has_write @ +identifier fops0.fops; +identifier write_f; +@@ + struct file_operations fops = { + .write = write_f, + }; + +@ has_write_iter @ +identifier fops0.fops; +identifier write_iter_f; +@@ + struct file_operations fops = { + .write_iter = write_iter_f, + }; + +@ has_llseek @ +identifier fops0.fops; +identifier llseek_f; +@@ + struct file_operations fops = { + .llseek = llseek_f, + }; + +@ has_no_llseek @ +identifier fops0.fops; +@@ + struct file_operations fops = { + .llseek = no_llseek, + }; + +@ has_mmap @ +identifier fops0.fops; +identifier mmap_f; +@@ + struct file_operations fops = { + .mmap = mmap_f, + }; + +@ has_copy_file_range @ +identifier fops0.fops; +identifier copy_file_range_f; +@@ + struct file_operations fops = { + .copy_file_range = copy_file_range_f, + }; + +@ has_remap_file_range @ +identifier fops0.fops; +identifier remap_file_range_f; +@@ + struct file_operations fops = { + .remap_file_range = remap_file_range_f, + }; + +@ has_splice_read @ +identifier fops0.fops; +identifier splice_read_f; +@@ + struct file_operations fops = { + .splice_read = splice_read_f, + }; + +@ has_splice_write @ +identifier fops0.fops; +identifier splice_write_f; +@@ + struct file_operations fops = { + .splice_write = splice_write_f, + }; + + +// file_operations that is candidate for stream_open conversion - it does not +// use mmap and other methods that assume @offset access to file. +// +// XXX for simplicity require no .{read/write}_iter and no .splice_{read/write} for now. +// XXX maybe_steam.fops cannot be used in other rules - it gives "bad rule maybe_stream or bad variable fops". +@ maybe_stream depends on (!has_llseek || has_no_llseek) && !has_mmap && !has_copy_file_range && !has_remap_file_range && !has_read_iter && !has_write_iter && !has_splice_read && !has_splice_write @ +identifier fops0.fops; +@@ + struct file_operations fops = { + }; + + +// ---- conversions ---- + +// XXX .open = nonseekable_open -> .open = stream_open +// XXX .open = func -> openfunc -> nonseekable_open + +// read & write +// +// if both are used in the same file_operations together with an opener - +// under that conditions we can use stream_open instead of nonseekable_open. +@ fops_rw depends on maybe_stream @ +identifier fops0.fops, openfunc; +identifier stream_reader.readstream; +identifier stream_writer.writestream; +@@ + struct file_operations fops = { + .open = openfunc, + .read = readstream, + .write = writestream, + }; + +@ report_rw depends on report @ +identifier fops_rw.openfunc; +position p1; +@@ + openfunc(...) { + <... + nonseekable_open@p1 + ...> + } + +@ script:python depends on report && reader_blocks @ +fops << fops0.fops; +p << report_rw.p1; +@@ +coccilib.report.print_report(p[0], + "ERROR: %s: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix." % (fops,)) + +@ script:python depends on report && !reader_blocks @ +fops << fops0.fops; +p << report_rw.p1; +@@ +coccilib.report.print_report(p[0], + "WARNING: %s: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open." % (fops,)) + + +@ explain_rw_deadlocked depends on explain && reader_blocks @ +identifier fops_rw.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ nonseekable_open /* read & write (was deadlock) */ + ...> + } + + +@ explain_rw_nodeadlock depends on explain && !reader_blocks @ +identifier fops_rw.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ nonseekable_open /* read & write (no direct deadlock) */ + ...> + } + +@ patch_rw depends on patch @ +identifier fops_rw.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ stream_open + ...> + } + + +// read, but not write +@ fops_r depends on maybe_stream && !has_write @ +identifier fops0.fops, openfunc; +identifier stream_reader.readstream; +@@ + struct file_operations fops = { + .open = openfunc, + .read = readstream, + }; + +@ report_r depends on report @ +identifier fops_r.openfunc; +position p1; +@@ + openfunc(...) { + <... + nonseekable_open@p1 + ...> + } + +@ script:python depends on report @ +fops << fops0.fops; +p << report_r.p1; +@@ +coccilib.report.print_report(p[0], + "WARNING: %s: .read() has stream semantic; safe to change nonseekable_open -> stream_open." % (fops,)) + +@ explain_r depends on explain @ +identifier fops_r.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ nonseekable_open /* read only */ + ...> + } + +@ patch_r depends on patch @ +identifier fops_r.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ stream_open + ...> + } + + +// write, but not read +@ fops_w depends on maybe_stream && !has_read @ +identifier fops0.fops, openfunc; +identifier stream_writer.writestream; +@@ + struct file_operations fops = { + .open = openfunc, + .write = writestream, + }; + +@ report_w depends on report @ +identifier fops_w.openfunc; +position p1; +@@ + openfunc(...) { + <... + nonseekable_open@p1 + ...> + } + +@ script:python depends on report @ +fops << fops0.fops; +p << report_w.p1; +@@ +coccilib.report.print_report(p[0], + "WARNING: %s: .write() has stream semantic; safe to change nonseekable_open -> stream_open." % (fops,)) + +@ explain_w depends on explain @ +identifier fops_w.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ nonseekable_open /* write only */ + ...> + } + +@ patch_w depends on patch @ +identifier fops_w.openfunc; +@@ + openfunc(...) { + <... +- nonseekable_open ++ stream_open + ...> + } + + +// no read, no write - don't change anything -- cgit v1.2.3-59-g8ed1b From 1200e07f3ad4b9d976cf2fff3a0c3d9a1faecb3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 8 Apr 2019 19:02:38 +0800 Subject: block: don't use for-inside-for in bio_for_each_segment_all Commit 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") changes bio_for_each_segment_all() to use for-inside-for. This way breaks all bio_for_each_segment_all() call with error out branch via 'break', since now 'break' can only break from the inner loop. Fixes this issue by implementing bio_for_each_segment_all() via single 'for' loop, and now the logic is very similar with normal bvec iterator. Cc: Qu Wenruo Cc: linux-btrfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: Omar Sandoval Reviewed-by: Johannes Thumshirn Reported-and-Tested-by: Qu Wenruo Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 20 ++++++++++++-------- include/linux/bvec.h | 14 ++++++++++---- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index bb6090aa165d..e584673c1881 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -120,19 +120,23 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } -#define mp_bvec_for_each_segment(bv, bvl, i, iter_all) \ - for (bv = bvec_init_iter_all(&iter_all); \ - (iter_all.done < (bvl)->bv_len) && \ - (mp_bvec_next_segment((bvl), &iter_all), 1); \ - iter_all.done += bv->bv_len, i += 1) +static inline bool bio_next_segment(const struct bio *bio, + struct bvec_iter_all *iter) +{ + if (iter->idx >= bio->bi_vcnt) + return false; + + bvec_advance(&bio->bi_io_vec[iter->idx], iter); + return true; +} /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ -#define bio_for_each_segment_all(bvl, bio, i, iter_all) \ - for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++) \ - mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all) +#define bio_for_each_segment_all(bvl, bio, i, iter) \ + for (i = 0, bvl = bvec_init_iter_all(&iter); \ + bio_next_segment((bio), &iter); i++) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f6275c4da13a..3bc91879e1e2 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -145,18 +145,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { - iter_all->bv.bv_page = NULL; iter_all->done = 0; + iter_all->idx = 0; return &iter_all->bv; } -static inline void mp_bvec_next_segment(const struct bio_vec *bvec, - struct bvec_iter_all *iter_all) +static inline void bvec_advance(const struct bio_vec *bvec, + struct bvec_iter_all *iter_all) { struct bio_vec *bv = &iter_all->bv; - if (bv->bv_page) { + if (iter_all->done) { bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { @@ -165,6 +165,12 @@ static inline void mp_bvec_next_segment(const struct bio_vec *bvec, } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); + iter_all->done += bv->bv_len; + + if (iter_all->done == bvec->bv_len) { + iter_all->idx++; + iter_all->done = 0; + } } /* -- cgit v1.2.3-59-g8ed1b From cf94db21905333e610e479688add629397a4b384 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 8 Apr 2019 14:33:22 +0200 Subject: virtio: Honour 'may_reduce_num' in vring_create_virtqueue vring_create_virtqueue() allows the caller to specify via the may_reduce_num parameter whether the vring code is allowed to allocate a smaller ring than specified. However, the split ring allocation code tries to allocate a smaller ring on allocation failure regardless of what the caller specified. This may cause trouble for e.g. virtio-pci in legacy mode, which does not support ring resizing. (The packed ring code does not resize in any case.) Let's fix this by bailing out immediately in the split ring code if the requested size cannot be allocated and may_reduce_num has not been specified. While at it, fix a typo in the usage instructions. Fixes: 2a2d1382fe9d ("virtio: Add improved queue allocation API") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin Reviewed-by: Halil Pasic Reviewed-by: Jens Freimann --- drivers/virtio/virtio_ring.c | 2 ++ include/linux/virtio_ring.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 18846afb39da..5df92c308286 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -882,6 +882,8 @@ static struct virtqueue *vring_create_virtqueue_split( GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); if (queue) break; + if (!may_reduce_num) + return NULL; } if (!num) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index fab02133a919..3dc70adfe5f5 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -63,7 +63,7 @@ struct virtqueue; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than - * expected. The caller should query virtqueue_get_ring_size to learn + * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, -- cgit v1.2.3-59-g8ed1b From 1b8f21b74c3c9c82fce5a751d7aefb7cc0b8d33d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Apr 2019 06:31:21 +0800 Subject: blk-mq: introduce blk_mq_complete_request_sync() In NVMe's error handler, follows the typical steps of tearing down hardware for recovering controller: 1) stop blk_mq hw queues 2) stop the real hw queues 3) cancel in-flight requests via blk_mq_tagset_busy_iter(tags, cancel_request, ...) cancel_request(): mark the request as abort blk_mq_complete_request(req); 4) destroy real hw queues However, there may be race between #3 and #4, because blk_mq_complete_request() may run q->mq_ops->complete(rq) remotelly and asynchronously, and ->complete(rq) may be run after #4. This patch introduces blk_mq_complete_request_sync() for fixing the above race. Cc: Sagi Grimberg Cc: Bart Van Assche Cc: James Smart Cc: linux-nvme@lists.infradead.org Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++++++ include/linux/blk-mq.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index a9354835cf51..9516304a38ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -654,6 +654,13 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); +void blk_mq_complete_request_sync(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + rq->q->mq_ops->complete(rq); +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync); + int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb2aa7ecafff..db29928de467 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -302,6 +302,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3-59-g8ed1b From 7c2e07130090ae001a97a6b65597830d6815e93e Mon Sep 17 00:00:00 2001 From: David Müller Date: Mon, 8 Apr 2019 15:33:54 +0200 Subject: clk: x86: Add system specific quirk to mark clocks as critical MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL"), the pmc_plt_clocks of the Bay Trail SoC are unconditionally gated off. Unfortunately this will break systems where these clocks are used for external purposes beyond the kernel's knowledge. Fix it by implementing a system specific quirk to mark the necessary pmc_plt_clks as critical. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Signed-off-by: David Müller Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Stephen Boyd --- drivers/clk/x86/clk-pmc-atom.c | 14 +++++++++++--- drivers/platform/x86/pmc_atom.c | 21 +++++++++++++++++++++ include/linux/platform_data/x86/clk-pmc-atom.h | 3 +++ 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/x86/clk-pmc-atom.c b/drivers/clk/x86/clk-pmc-atom.c index d977193842df..19174835693b 100644 --- a/drivers/clk/x86/clk-pmc-atom.c +++ b/drivers/clk/x86/clk-pmc-atom.c @@ -165,7 +165,7 @@ static const struct clk_ops plt_clk_ops = { }; static struct clk_plt *plt_clk_register(struct platform_device *pdev, int id, - void __iomem *base, + const struct pmc_clk_data *pmc_data, const char **parent_names, int num_parents) { @@ -184,9 +184,17 @@ static struct clk_plt *plt_clk_register(struct platform_device *pdev, int id, init.num_parents = num_parents; pclk->hw.init = &init; - pclk->reg = base + PMC_CLK_CTL_OFFSET + id * PMC_CLK_CTL_SIZE; + pclk->reg = pmc_data->base + PMC_CLK_CTL_OFFSET + id * PMC_CLK_CTL_SIZE; spin_lock_init(&pclk->lock); + /* + * On some systems, the pmc_plt_clocks already enabled by the + * firmware are being marked as critical to avoid them being + * gated by the clock framework. + */ + if (pmc_data->critical && plt_clk_is_enabled(&pclk->hw)) + init.flags |= CLK_IS_CRITICAL; + ret = devm_clk_hw_register(&pdev->dev, &pclk->hw); if (ret) { pclk = ERR_PTR(ret); @@ -332,7 +340,7 @@ static int plt_clk_probe(struct platform_device *pdev) return PTR_ERR(parent_names); for (i = 0; i < PMC_CLK_NUM; i++) { - data->clks[i] = plt_clk_register(pdev, i, pmc_data->base, + data->clks[i] = plt_clk_register(pdev, i, pmc_data, parent_names, data->nparents); if (IS_ERR(data->clks[i])) { err = PTR_ERR(data->clks[i]); diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c index 8f018b3f3cd4..eaec2d306481 100644 --- a/drivers/platform/x86/pmc_atom.c +++ b/drivers/platform/x86/pmc_atom.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -391,11 +392,27 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc) } #endif /* CONFIG_DEBUG_FS */ +/* + * Some systems need one or more of their pmc_plt_clks to be + * marked as critical. + */ +static const struct dmi_system_id critclk_systems[] __initconst = { + { + .ident = "MPL CEC1x", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MPL AG"), + DMI_MATCH(DMI_PRODUCT_NAME, "CEC10 Family"), + }, + }, + { /*sentinel*/ } +}; + static int pmc_setup_clks(struct pci_dev *pdev, void __iomem *pmc_regmap, const struct pmc_data *pmc_data) { struct platform_device *clkdev; struct pmc_clk_data *clk_data; + const struct dmi_system_id *d = dmi_first_match(critclk_systems); clk_data = kzalloc(sizeof(*clk_data), GFP_KERNEL); if (!clk_data) @@ -403,6 +420,10 @@ static int pmc_setup_clks(struct pci_dev *pdev, void __iomem *pmc_regmap, clk_data->base = pmc_regmap; /* offset is added by client */ clk_data->clks = pmc_data->clks; + if (d) { + clk_data->critical = true; + pr_info("%s critclks quirk enabled\n", d->ident); + } clkdev = platform_device_register_data(&pdev->dev, "clk-pmc-atom", PLATFORM_DEVID_NONE, diff --git a/include/linux/platform_data/x86/clk-pmc-atom.h b/include/linux/platform_data/x86/clk-pmc-atom.h index 3ab892208343..7a37ac27d0fb 100644 --- a/include/linux/platform_data/x86/clk-pmc-atom.h +++ b/include/linux/platform_data/x86/clk-pmc-atom.h @@ -35,10 +35,13 @@ struct pmc_clk { * * @base: PMC clock register base offset * @clks: pointer to set of registered clocks, typically 0..5 + * @critical: flag to indicate if firmware enabled pmc_plt_clks + * should be marked as critial or not */ struct pmc_clk_data { void __iomem *base; const struct pmc_clk *clks; + bool critical; }; #endif /* __PLATFORM_DATA_X86_CLK_PMC_ATOM_H */ -- cgit v1.2.3-59-g8ed1b From 8065a779f17e94536a1c4dcee4f9d88011672f97 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Mon, 8 Apr 2019 19:45:27 -0400 Subject: failover: allow name change on IFF_UP slave interfaces When a netdev appears through hot plug then gets enslaved by a failover master that is already up and running, the slave will be opened right away after getting enslaved. Today there's a race that userspace (udev) may fail to rename the slave if the kernel (net_failover) opens the slave earlier than when the userspace rename happens. Unlike bond or team, the primary slave of failover can't be renamed by userspace ahead of time, since the kernel initiated auto-enslavement is unable to, or rather, is never meant to be synchronized with the rename request from userspace. As the failover slave interfaces are not designed to be operated directly by userspace apps: IP configuration, filter rules with regard to network traffic passing and etc., should all be done on master interface. In general, userspace apps only care about the name of master interface, while slave names are less important as long as admin users can see reliable names that may carry other information describing the netdev. For e.g., they can infer that "ens3nsby" is a standby slave of "ens3", while for a name like "eth0" they can't tell which master it belongs to. Historically the name of IFF_UP interface can't be changed because there might be admin script or management software that is already relying on such behavior and assumes that the slave name can't be changed once UP. But failover is special: with the in-kernel auto-enslavement mechanism, the userspace expectation for device enumeration and bring-up order is already broken. Previously initramfs and various userspace config tools were modified to bypass failover slaves because of auto-enslavement and duplicate MAC address. Similarly, in case that users care about seeing reliable slave name, the new type of failover slaves needs to be taken care of specifically in userspace anyway. It's less risky to lift up the rename restriction on failover slave which is already UP. Although it's possible this change may potentially break userspace component (most likely configuration scripts or management software) that assumes slave name can't be changed while UP, it's relatively a limited and controllable set among all userspace components, which can be fixed specifically to listen for the rename events on failover slaves. Userspace component interacting with slaves is expected to be changed to operate on failover master interface instead, as the failover slave is dynamic in nature which may come and go at any point. The goal is to make the role of failover slaves less relevant, and userspace components should only deal with failover master in the long run. Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module") Signed-off-by: Si-Wei Liu Reviewed-by: Liran Alon Acked-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 16 +++++++++++++++- net/core/failover.c | 6 +++--- 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 26f69cf763f4..324e872c91d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1500,6 +1500,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1532,6 +1533,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, + IFF_LIVE_RENAME_OK = 1<<30, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1563,6 +1565,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER +#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK /** * struct net_device - The DEVICE structure. diff --git a/net/core/dev.c b/net/core/dev.c index fdcff29df915..f409406254dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1184,7 +1184,21 @@ int dev_change_name(struct net_device *dev, const char *newname) BUG_ON(!dev_net(dev)); net = dev_net(dev); - if (dev->flags & IFF_UP) + + /* Some auto-enslaved devices e.g. failover slaves are + * special, as userspace might rename the device after + * the interface had been brought up and running since + * the point kernel initiated auto-enslavement. Allow + * live name change even when these slave devices are + * up and running. + * + * Typically, users of these auto-enslaving devices + * don't actually care about slave name change, as + * they are supposed to operate on master interface + * directly. + */ + if (dev->flags & IFF_UP && + likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; write_seqcount_begin(&devnet_rename_seq); diff --git a/net/core/failover.c b/net/core/failover.c index 4a92a98ccce9..b5cd3c727285 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) -- cgit v1.2.3-59-g8ed1b From d808b7f759b50acf0784ce6230ffa63e12ef465d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Apr 2019 10:03:59 -0600 Subject: nvmet: fix discover log page when offsets are used The nvme target hadn't been taking the Get Log Page offset parameter into consideration, and so has been returning corrupted log pages when offsets are used. Since many tools, including nvme-cli, split the log request to 4k, we've been breaking discovery log responses when more than 3 subsystems exist. Fix the returned data by internally generating the entire discovery log page and copying only the requested bytes into the user buffer. The command log page offset type has been modified to a native __le64 to make it easier to extract the value from a command. Signed-off-by: Keith Busch Tested-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 5 +++ drivers/nvme/target/discovery.c | 68 +++++++++++++++++++++++++++-------------- drivers/nvme/target/nvmet.h | 1 + include/linux/nvme.h | 9 ++++-- 4 files changed, 58 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 76250181fee0..9f72d515fc4b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -24,6 +24,11 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +u64 nvmet_get_log_page_offset(struct nvme_command *cmd) +{ + return le64_to_cpu(cmd->get_log_page.lpo); +} + static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) { nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index c872b47a88f3..33ed95e72d6b 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -131,54 +131,76 @@ static void nvmet_set_disc_traddr(struct nvmet_req *req, struct nvmet_port *port memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); } +static size_t discovery_log_entries(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_subsys_link *p; + struct nvmet_port *r; + size_t entries = 0; + + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) + continue; + entries++; + } + list_for_each_entry(r, &req->port->referrals, entry) + entries++; + return entries; +} + static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) { const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmf_disc_rsp_page_hdr *hdr; + u64 offset = nvmet_get_log_page_offset(req->cmd); size_t data_len = nvmet_get_log_page_len(req->cmd); - size_t alloc_len = max(data_len, sizeof(*hdr)); - int residual_len = data_len - sizeof(*hdr); + size_t alloc_len; struct nvmet_subsys_link *p; struct nvmet_port *r; u32 numrec = 0; u16 status = 0; + void *buffer; + + /* Spec requires dword aligned offsets */ + if (offset & 0x3) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } /* * Make sure we're passing at least a buffer of response header size. * If host provided data len is less than the header size, only the * number of bytes requested by host will be sent to host. */ - hdr = kzalloc(alloc_len, GFP_KERNEL); - if (!hdr) { + down_read(&nvmet_config_sem); + alloc_len = sizeof(*hdr) + entry_size * discovery_log_entries(req); + buffer = kzalloc(alloc_len, GFP_KERNEL); + if (!buffer) { + up_read(&nvmet_config_sem); status = NVME_SC_INTERNAL; goto out; } - down_read(&nvmet_config_sem); + hdr = buffer; list_for_each_entry(p, &req->port->subsystems, entry) { + char traddr[NVMF_TRADDR_SIZE]; + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) continue; - if (residual_len >= entry_size) { - char traddr[NVMF_TRADDR_SIZE]; - - nvmet_set_disc_traddr(req, req->port, traddr); - nvmet_format_discovery_entry(hdr, req->port, - p->subsys->subsysnqn, traddr, - NVME_NQN_NVME, numrec); - residual_len -= entry_size; - } + + nvmet_set_disc_traddr(req, req->port, traddr); + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, traddr, + NVME_NQN_NVME, numrec); numrec++; } list_for_each_entry(r, &req->port->referrals, entry) { - if (residual_len >= entry_size) { - nvmet_format_discovery_entry(hdr, r, - NVME_DISC_SUBSYS_NAME, - r->disc_addr.traddr, - NVME_NQN_DISC, numrec); - residual_len -= entry_size; - } + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + r->disc_addr.traddr, + NVME_NQN_DISC, numrec); numrec++; } @@ -190,8 +212,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) up_read(&nvmet_config_sem); - status = nvmet_copy_to_sgl(req, 0, hdr, data_len); - kfree(hdr); + status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len); + kfree(buffer); out: nvmet_req_complete(req, status); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 51e49efd7849..1653d19b187f 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -428,6 +428,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); u32 nvmet_get_log_page_len(struct nvme_command *cmd); +u64 nvmet_get_log_page_offset(struct nvme_command *cmd); extern struct list_head *nvmet_ports; void nvmet_port_disc_changed(struct nvmet_port *port, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index baa49e6a23cc..c40720cb59ac 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -967,8 +967,13 @@ struct nvme_get_log_page_command { __le16 numdl; __le16 numdu; __u16 rsvd11; - __le32 lpol; - __le32 lpou; + union { + struct { + __le32 lpol; + __le32 lpou; + }; + __le64 lpo; + }; __u32 rsvd14[2]; }; -- cgit v1.2.3-59-g8ed1b From af6b61d7ef58099c82d854395a0e002be6bd036c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 11 Apr 2019 15:16:52 -0400 Subject: Revert "SUNRPC: Micro-optimise when the task is known not to be sleeping" This reverts commit 009a82f6437490c262584d65a14094a818bcb747. The ability to optimise here relies on compiler being able to optimise away tail calls to avoid stack overflows. Unfortunately, we are seeing reports of problems, so let's just revert. Reported-by: Daniel Mack Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 8 -------- net/sunrpc/clnt.c | 45 ++++++++------------------------------------ 2 files changed, 8 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ec861cd0cfe8..52d41d0c1ae1 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -304,12 +304,4 @@ rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) } #endif /* CONFIG_SUNRPC_SWAP */ -static inline bool -rpc_task_need_resched(const struct rpc_task *task) -{ - if (RPC_IS_QUEUED(task) || task->tk_callback) - return true; - return false; -} - #endif /* _LINUX_SUNRPC_SCHED_H_ */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 187d10443a15..1d0395ef62c9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1540,7 +1540,6 @@ call_start(struct rpc_task *task) clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; rpc_task_set_transport(task, clnt); - call_reserve(task); } /* @@ -1554,9 +1553,6 @@ call_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } static void call_retry_reserve(struct rpc_task *task); @@ -1579,7 +1575,6 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; - call_refresh(task); return; } @@ -1605,7 +1600,6 @@ call_reserveresult(struct rpc_task *task) /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; - call_retry_reserve(task); return; case -EIO: /* probably a shutdown */ break; @@ -1628,9 +1622,6 @@ call_retry_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } /* @@ -1645,9 +1636,6 @@ call_refresh(struct rpc_task *task) task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; rpcauth_refreshcred(task); - if (rpc_task_need_resched(task)) - return; - call_refreshresult(task); } /* @@ -1666,7 +1654,6 @@ call_refreshresult(struct rpc_task *task) case 0: if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - call_allocate(task); return; } /* Use rate-limiting and a max number of retries if refresh @@ -1685,7 +1672,6 @@ call_refreshresult(struct rpc_task *task) task->tk_cred_retry--; dprintk("RPC: %5u %s: retry refresh creds\n", task->tk_pid, __func__); - call_refresh(task); return; } dprintk("RPC: %5u %s: refresh creds failed with error %d\n", @@ -1711,10 +1697,8 @@ call_allocate(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_encode; - if (req->rq_buffer) { - call_encode(task); + if (req->rq_buffer) return; - } if (proc->p_proc != 0) { BUG_ON(proc->p_arglen == 0); @@ -1740,12 +1724,8 @@ call_allocate(struct rpc_task *task) status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); - if (status == 0) { - if (rpc_task_need_resched(task)) - return; - call_encode(task); + if (status == 0) return; - } if (status != -ENOMEM) { rpc_exit(task, status); return; @@ -1828,8 +1808,12 @@ call_encode(struct rpc_task *task) xprt_request_enqueue_receive(task); xprt_request_enqueue_transmit(task); out: - task->tk_action = call_bind; - call_bind(task); + task->tk_action = call_transmit; + /* Check that the connection is OK */ + if (!xprt_bound(task->tk_xprt)) + task->tk_action = call_bind; + else if (!xprt_connected(task->tk_xprt)) + task->tk_action = call_connect; } /* @@ -1847,7 +1831,6 @@ rpc_task_handle_transmitted(struct rpc_task *task) { xprt_end_transmit(task); task->tk_action = call_transmit_status; - call_transmit_status(task); } /* @@ -1865,7 +1848,6 @@ call_bind(struct rpc_task *task) if (xprt_bound(xprt)) { task->tk_action = call_connect; - call_connect(task); return; } @@ -1896,7 +1878,6 @@ call_bind_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; task->tk_action = call_connect; - call_connect(task); return; } @@ -1981,7 +1962,6 @@ call_connect(struct rpc_task *task) if (xprt_connected(xprt)) { task->tk_action = call_transmit; - call_transmit(task); return; } @@ -2051,7 +2031,6 @@ call_connect_status(struct rpc_task *task) case 0: clnt->cl_stats->netreconn++; task->tk_action = call_transmit; - call_transmit(task); return; } rpc_exit(task, status); @@ -2087,9 +2066,6 @@ call_transmit(struct rpc_task *task) xprt_transmit(task); } xprt_end_transmit(task); - if (rpc_task_need_resched(task)) - return; - call_transmit_status(task); } /* @@ -2107,9 +2083,6 @@ call_transmit_status(struct rpc_task *task) if (rpc_task_transmitted(task)) { if (task->tk_status == 0) xprt_request_wait_receive(task); - if (rpc_task_need_resched(task)) - return; - call_status(task); return; } @@ -2170,7 +2143,6 @@ call_bc_encode(struct rpc_task *task) { xprt_request_enqueue_transmit(task); task->tk_action = call_bc_transmit; - call_bc_transmit(task); } /* @@ -2261,7 +2233,6 @@ call_status(struct rpc_task *task) status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; - call_decode(task); return; } -- cgit v1.2.3-59-g8ed1b From 77f1e0a52d26242b6c2dba019f6ebebfb9ff701e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Jan 2019 10:34:16 -0700 Subject: bfq: update internal depth state when queue depth changes A previous commit moved the shallow depth and BFQ depth map calculations to be done at init time, moving it outside of the hotter IO path. This potentially causes hangs if the users changes the depth of the scheduler map, by writing to the 'nr_requests' sysfs file for that device. Add a blk-mq-sched hook that allows blk-mq to inform the scheduler if the depth changes, so that the scheduler can update its internal state. Tested-by: Kai Krakow Reported-by: Paolo Valente Fixes: f0635b8a416e ("bfq: calculate shallow depths at init time") Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 +++++++- block/blk-mq.c | 2 ++ include/linux/elevator.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index dfb8cb0af13a..5ba1e0d841b4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5396,7 +5396,7 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, return min_shallow; } -static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; @@ -5404,6 +5404,11 @@ static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); +} + +static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +{ + bfq_depth_updated(hctx); return 0; } @@ -5826,6 +5831,7 @@ static struct elevator_type iosched_bfq_mq = { .requests_merged = bfq_requests_merged, .request_merged = bfq_request_merged, .has_work = bfq_has_work, + .depth_updated = bfq_depth_updated, .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, diff --git a/block/blk-mq.c b/block/blk-mq.c index 9516304a38ee..fc60ed7e940e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3135,6 +3135,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (ret) break; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(hctx); } if (!ret) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2e9e2763bf47..6e8bc53740f0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -31,6 +31,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3-59-g8ed1b From f958d7b528b1b40c44cfda5eabe2d82760d868c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:06:20 -0700 Subject: mm: make page ref count overflow check tighter and more explicit We have a VM_BUG_ON() to check that the page reference count doesn't underflow (or get close to overflow) by checking the sign of the count. That's all fine, but we actually want to allow people to use a "get page ref unless it's already very high" helper function, and we want that one to use the sign of the page ref (without triggering this VM_BUG_ON). Change the VM_BUG_ON to only check for small underflows (or _very_ close to overflowing), and ignore overflows which have strayed into negative territory. Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..541d99b86aea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -965,6 +965,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ +/* 127: arbitrary random number, small enough to assemble well */ +#define page_ref_zero_or_close_to_overflow(page) \ + ((unsigned int) page_ref_count(page) + 127u <= 127u) + static inline void get_page(struct page *page) { page = compound_head(page); @@ -972,7 +976,7 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ - VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } -- cgit v1.2.3-59-g8ed1b From 88b1a17dfc3ed7728316478fae0f5ad508f50397 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:14:59 -0700 Subject: mm: add 'try_get_page()' helper function This is the same as the traditional 'get_page()' function, but instead of unconditionally incrementing the reference count of the page, it only does so if the count was "safe". It returns whether the reference count was incremented (and is marked __must_check, since the caller obviously has to be aware of it). Also like 'get_page()', you can't use this function unless you already had a reference to the page. The intent is that you can use this exactly like get_page(), but in situations where you want to limit the maximum reference count. The code currently does an unconditional WARN_ON_ONCE() if we ever hit the reference count issues (either zero or negative), as a notification that the conditional non-increment actually happened. NOTE! The count access for the "safety" check is inherently racy, but that doesn't matter since the buffer we use is basically half the range of the reference count (ie we look at the sign of the count). Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 541d99b86aea..7000ddd807e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -980,6 +980,15 @@ static inline void get_page(struct page *page) page_ref_inc(page); } +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} + static inline void put_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3-59-g8ed1b From 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 12 ++++++------ fs/pipe.c | 4 ++-- fs/splice.c | 12 ++++++++++-- include/linux/pipe_fs_i.h | 10 ++++++---- kernel/trace/trace.c | 6 +++++- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 809c0f2f9942..64f4de983468 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2034,10 +2034,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2055,7 +2053,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2078,11 +2078,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c index bdc5d3c0977d..b1543b85c14a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -189,9 +189,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/fs/splice.c b/fs/splice.c index de2ede048473..f30af82b850d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1588,7 +1588,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1660,7 +1664,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5a3bb3b7c9ad..3f2a42c11e20 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,18 +108,20 @@ struct pipe_buf_operations { /* * Get a reference to the pipe buffer. */ - void (*get)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to + * + * Return: %true if the reference was successfully obtained. */ -static inline void pipe_buf_get(struct pipe_inode_info *pipe, +static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->ops->get(pipe, buf); + return buf->ops->get(pipe, buf); } /** @@ -178,7 +180,7 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4238b441624..0f300d488c9f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6835,12 +6835,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; + if (ref->ref > INT_MAX/2) + return false; + ref->ref++; + return true; } /* Pipe buffer operations for a buffer. */ -- cgit v1.2.3-59-g8ed1b From 0082517fa4bce073e7cf542633439f26538a14cc Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 12 Apr 2019 16:01:53 +0800 Subject: x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Upon reboot, the Acer TravelMate X514-51T laptop appears to complete the shutdown process, but then it hangs in BIOS POST with a black screen. The problem is intermittent - at some points it has appeared related to Secure Boot settings or different kernel builds, but ultimately we have not been able to identify the exact conditions that trigger the issue to come and go. Besides, the EFI mode cannot be disabled in the BIOS of this model. However, after extensive testing, we observe that using the EFI reboot method reliably avoids the issue in all cases. So add a boot time quirk to use EFI reboot on such systems. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203119 Signed-off-by: Jian-Hong Pan Signed-off-by: Daniel Drake Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux@endlessm.com Link: http://lkml.kernel.org/r/20190412080152.3718-1-jian-hong@endlessm.com [ Fix !CONFIG_EFI build failure, clarify the code and the changelog a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 21 +++++++++++++++++++++ include/linux/efi.h | 7 ++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..8fd3cedd9acc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 54357a258b35..6ebc2098cfe1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1611,7 +1611,12 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, unsigned long size); -bool efi_runtime_disabled(void); +#ifdef CONFIG_EFI +extern bool efi_runtime_disabled(void); +#else +static inline bool efi_runtime_disabled(void) { return true; } +#endif + extern void efi_call_virt_check_flags(unsigned long flags, const char *call); extern unsigned long efi_call_virt_save_flags(void); -- cgit v1.2.3-59-g8ed1b From 1d487e9bf8ba66a7174c56a0029c54b1eca8f99c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Apr 2019 11:16:47 +0200 Subject: KVM: fix spectrev1 gadgets These were found with smatch, and then generalized when applicable. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- include/linux/kvm_host.h | 10 ++++++---- virt/kvm/irqchip.c | 5 +++-- virt/kvm/kvm_main.c | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 991fdf7fc17f..9bf70cf84564 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,6 +138,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -901,7 +902,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9d55c63db09b..640a03642766 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -513,10 +514,10 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { - /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case - * the caller has read kvm->online_vcpus before (as is the case - * for kvm_for_each_vcpu, for example). - */ + int num_vcpus = atomic_read(&kvm->online_vcpus); + i = array_index_nospec(i, num_vcpus); + + /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return kvm->vcpus[i]; } @@ -600,6 +601,7 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { + as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 3547b0d8c91e..79e59e4fa3dc 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -144,18 +144,19 @@ static int setup_routing_entry(struct kvm *kvm, { struct kvm_kernel_irq_routing_entry *ei; int r; + u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) + hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; - e->gsi = ue->gsi; + e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 55fe8e20d8fd..dc8edc97ba85 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2977,12 +2977,14 @@ static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_device_ops *ops = NULL; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; - ops = kvm_device_ops_table[cd->type]; + type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); + ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; @@ -2997,7 +2999,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->kvm = kvm; mutex_lock(&kvm->lock); - ret = ops->create(dev, cd->type); + ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); -- cgit v1.2.3-59-g8ed1b From 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: x86/kprobes: Verify stack frame on kretprobe Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 26 ++++++++++++++++++++++++++ include/linux/kprobes.h | 1 + 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..18fbe9be2d68 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -569,6 +569,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -759,15 +760,21 @@ static __used void *trampoline_handler(struct pt_regs *regs) unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -789,8 +796,25 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -808,6 +832,8 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 201f0f2683f2..9a897256e481 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -173,6 +173,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; -- cgit v1.2.3-59-g8ed1b From af53d3e9e04024885de5b4fda51e5fa362ae2bd8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 18 Apr 2019 17:50:13 -0700 Subject: mm: swapoff: shmem_unuse() stop eviction without igrab() The igrab() in shmem_unuse() looks good, but we forgot that it gives no protection against concurrent unmounting: a point made by Konstantin Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78 ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day..." followed by GPF. Once again, give up on using igrab(); but don't go back to making such heavy-handed use of shmem_swaplist_mutex as last time: that would spoil the new design, and I expect could deadlock inside shmem_swapin_page(). Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem- specific inode, and shmem_evict_inode() wait for that to go down to 0. Call it "stop_eviction" rather than "swapoff_busy" because it can be put to use for others later (huge tmpfs patches expect to use it). That simplifies shmem_unuse(), protecting it from both unlink and unmount; and in practice lets it locate all the swap in its first try. But do not rely on that: there's still a theoretical case, when shmem_writepage() might have been preempted after its get_swap_page(), before making the swap entry visible to swapoff. [hughd@google.com: remove incorrect list_del()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Hugh Dickins Cc: "Alex Xu (Hello71)" Cc: Huang Ying Cc: Kelley Nielsen Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Vineeth Pillai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + mm/shmem.c | 40 ++++++++++++++++++---------------------- mm/swapfile.c | 11 +++++------ 3 files changed, 24 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f3fb1edb3526..20d815a33145 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -21,6 +21,7 @@ struct shmem_inode_info { struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ + atomic_t stop_eviction; /* hold when working on inode */ struct inode vfs_inode; }; diff --git a/mm/shmem.c b/mm/shmem.c index 859e8628071f..2275a0ff7c30 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode) } spin_unlock(&sbinfo->shrinklist_lock); } - if (!list_empty(&info->swaplist)) { + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); mutex_lock(&shmem_swaplist_mutex); - list_del_init(&info->swaplist); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } } @@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse) { struct shmem_inode_info *info, *next; - struct inode *inode; - struct inode *prev_inode = NULL; int error = 0; if (list_empty(&shmem_swaplist)) return 0; mutex_lock(&shmem_swaplist_mutex); - - /* - * The extra refcount on the inode is necessary to safely dereference - * p->next after re-acquiring the lock. New shmem inodes with swap - * get added to the end of the list and we will scan them all. - */ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } - - inode = igrab(&info->vfs_inode); - if (!inode) - continue; - + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). + */ + atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - prev_inode = inode; - error = shmem_unuse_inode(inode, type, frontswap, + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, fs_pages_to_unuse); cond_resched(); @@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap, next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); if (error) break; } mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - return error; } @@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->shrinklist); diff --git a/mm/swapfile.c b/mm/swapfile.c index 71383625a582..cf63b5f01adf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2116,12 +2116,11 @@ retry: * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * - * Limit the number of retries? No: when shmem_unuse()'s igrab() fails, - * a shmem inode using swap is being evicted; and when mmget_not_zero() - * above fails, that mm is likely to be freeing swap from exit_mmap(). - * Both proceed at their own independent pace: we could move them to - * separate lists, and wait for those lists to be emptied; but it's - * easier and more robust (though cpu-intensive) just to keep retrying. + * Limit the number of retries? No: when mmget_not_zero() above fails, + * that mm is likely to be freeing swap from exit_mmap(), which proceeds + * at its own independent pace; and even shmem_writepage() could have + * been preempted after get_swap_page(), temporarily hiding that swap. + * It's easy and robust (though cpu-intensive) just to keep retrying. */ if (si->inuse_pages) { if (!signal_pending(current)) -- cgit v1.2.3-59-g8ed1b From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/uverbs_main.c | 3 +++ fs/proc/task_mmu.c | 18 ++++++++++++++++++ fs/userfaultfd.c | 9 +++++++++ include/linux/sched/mm.h | 21 +++++++++++++++++++++ mm/mmap.c | 7 ++++++- 5 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 70b7d80431a9..f2e7ffe6fc54 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -993,6 +993,8 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * will only be one mm, so no big deal. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -1007,6 +1009,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); } mutex_unlock(&ufile->umap_lock); + skip_mm: up_write(&mm->mmap_sem); mmput(mm); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..95ca1fe7283c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..f5de1e726356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0cd9f10423fb..a3fda9f024c3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/mm/mmap.c b/mm/mmap.c index 41eb48d9b527..bd7b9f293b39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; -- cgit v1.2.3-59-g8ed1b From b40fabc05ea047f6af5933d26a5483873340b0d4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 19 Apr 2019 10:31:27 +0800 Subject: block: kill all_q_node in request_queue all_q_node has not been used since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"), so remove it. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c58a3b2bf00..317ab30d2904 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -548,7 +548,6 @@ struct request_queue { struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; struct percpu_ref q_usage_counter; - struct list_head all_q_node; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; -- cgit v1.2.3-59-g8ed1b From 6bedf00e55e5dd0a4ed1ad3f06131edd6fb56ec8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Apr 2019 09:11:26 +0800 Subject: block: make sure that bvec length can't be overflow bvec->bv_offset may be bigger than PAGE_SIZE sometimes, such as, when one bio is splitted in the middle of one bvec via bio_split(), and bi_iter.bi_bvec_done is used to build offset of the 1st bvec of remained bio. And the remained bio's bvec may be re-submitted to fs layer via ITER_IBVEC, such as loop and nvme-loop. So we have to make sure that every bvec's offset is less than PAGE_SIZE from bio_for_each_segment_all() because some drivers(loop, nvme-loop) passes the splitted bvec to fs layer via ITER_BVEC. This patch fixes this issue reported by Zhang Yi When running nvme/011. Cc: Christoph Hellwig Cc: Yi Zhang Reported-by: Yi Zhang Reviewed-by: Christoph Hellwig Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 3bc91879e1e2..ff13cbc1887d 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -160,8 +160,9 @@ static inline void bvec_advance(const struct bio_vec *bvec, bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { - bv->bv_page = bvec->bv_page; - bv->bv_offset = bvec->bv_offset; + bv->bv_page = bvec_nth_page(bvec->bv_page, bvec->bv_offset / + PAGE_SIZE); + bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); -- cgit v1.2.3-59-g8ed1b