aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c7
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_tree.c3
-rw-r--r--kernel/auditsc.c18
-rw-r--r--kernel/bounds.c1
-rw-r--r--kernel/bpf/Makefile12
-rw-r--r--kernel/bpf/arraymap.c41
-rw-r--r--kernel/bpf/bpf_lru_list.h3
-rw-r--r--kernel/bpf/cgroup.c570
-rw-r--r--kernel/bpf/core.c243
-rw-r--r--kernel/bpf/cpumap.c706
-rw-r--r--kernel/bpf/devmap.c418
-rw-r--r--kernel/bpf/disasm.c214
-rw-r--r--kernel/bpf/disasm.h32
-rw-r--r--kernel/bpf/hashtab.c65
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/lpm_trie.c105
-rw-r--r--kernel/bpf/offload.c194
-rw-r--r--kernel/bpf/percpu_freelist.c8
-rw-r--r--kernel/bpf/sockmap.c901
-rw-r--r--kernel/bpf/stackmap.c11
-rw-r--r--kernel/bpf/syscall.c463
-rw-r--r--kernel/bpf/tnum.c180
-rw-r--r--kernel/bpf/verifier.c3634
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h13
-rw-r--r--kernel/cgroup/cgroup-v1.c75
-rw-r--r--kernel/cgroup/cgroup.c1011
-rw-r--r--kernel/cgroup/cpuset.c92
-rw-r--r--kernel/cgroup/debug.c54
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/namespace.c1
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/compat.c23
-rw-r--r--kernel/configs/android-base.config1
-rw-r--r--kernel/cpu.c526
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/dma.c1
-rw-r--r--kernel/elfcore.c1
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/core.c646
-rw-r--r--kernel/events/internal.h7
-rw-r--r--kernel/events/ring_buffer.c49
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c46
-rw-r--r--kernel/extable.c47
-rw-r--r--kernel/fork.c36
-rw-r--r--kernel/futex.c123
-rw-r--r--kernel/futex_compat.c1
-rw-r--r--kernel/gcov/Makefile1
-rw-r--r--kernel/gcov/base.c1
-rw-r--r--kernel/gcov/fs.c1
-rw-r--r--kernel/gcov/gcc_3_4.c1
-rw-r--r--kernel/gcov/gcc_4_7.c1
-rw-r--r--kernel/gcov/gcov.h1
-rw-r--r--kernel/groups.c1
-rw-r--r--kernel/irq/Kconfig15
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c1
-rw-r--r--kernel/irq/autoprobe.c3
-rw-r--r--kernel/irq/chip.c148
-rw-r--r--kernel/irq/cpuhotplug.c28
-rw-r--r--kernel/irq/debug.h1
-rw-r--r--kernel/irq/debugfs.c62
-rw-r--r--kernel/irq/generic-chip.c16
-rw-r--r--kernel/irq/internals.h22
-rw-r--r--kernel/irq/irq_sim.c164
-rw-r--r--kernel/irq/irqdesc.c34
-rw-r--r--kernel/irq/irqdomain.c293
-rw-r--r--kernel/irq/manage.c58
-rw-r--r--kernel/irq/matrix.c443
-rw-r--r--kernel/irq/migration.c1
-rw-r--r--kernel/irq/msi.c37
-rw-r--r--kernel/irq/proc.c14
-rw-r--r--kernel/irq/resend.c1
-rw-r--r--kernel/irq/settings.h1
-rw-r--r--kernel/irq/spurious.c3
-rw-r--r--kernel/irq/timings.c2
-rw-r--r--kernel/irq_work.c21
-rw-r--r--kernel/jump_label.c114
-rw-r--r--kernel/kallsyms.c43
-rw-r--r--kernel/kcmp.c3
-rw-r--r--kernel/kcov.c2
-rw-r--r--kernel/kexec_core.c12
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kexec_internal.h1
-rw-r--r--kernel/kmod.c563
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/kthread.c76
-rw-r--r--kernel/livepatch/Makefile2
-rw-r--r--kernel/livepatch/core.c112
-rw-r--r--kernel/livepatch/core.h41
-rw-r--r--kernel/livepatch/patch.c1
-rw-r--r--kernel/livepatch/patch.h1
-rw-r--r--kernel/livepatch/shadow.c277
-rw-r--r--kernel/livepatch/transition.c45
-rw-r--r--kernel/livepatch/transition.h1
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/lockdep.c1049
-rw-r--r--kernel/locking/lockdep_internals.h3
-rw-r--r--kernel/locking/lockdep_proc.c5
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/mcs_spinlock.h1
-rw-r--r--kernel/locking/mutex-debug.h1
-rw-r--r--kernel/locking/mutex.h1
-rw-r--r--kernel/locking/osq_lock.c14
-rw-r--r--kernel/locking/qrwlock.c86
-rw-r--r--kernel/locking/qspinlock.c117
-rw-r--r--kernel/locking/qspinlock_paravirt.h70
-rw-r--r--kernel/locking/rtmutex-debug.c3
-rw-r--r--kernel/locking/rtmutex-debug.h1
-rw-r--r--kernel/locking/rtmutex.c35
-rw-r--r--kernel/locking/rtmutex.h1
-rw-r--r--kernel/locking/rtmutex_common.h42
-rw-r--r--kernel/locking/rwsem-spinlock.c38
-rw-r--r--kernel/locking/rwsem-xadd.c61
-rw-r--r--kernel/locking/rwsem.c17
-rw-r--r--kernel/locking/rwsem.h1
-rw-r--r--kernel/locking/spinlock.c10
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/membarrier.c70
-rw-r--r--kernel/memremap.c136
-rw-r--r--kernel/module.c44
-rw-r--r--kernel/padata.c71
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c35
-rw-r--r--kernel/pid_namespace.c4
-rw-r--r--kernel/power/Kconfig14
-rw-r--r--kernel/power/Makefile1
-rw-r--r--kernel/power/autosleep.c1
-rw-r--r--kernel/power/console.c1
-rw-r--r--kernel/power/hibernate.c29
-rw-r--r--kernel/power/main.c64
-rw-r--r--kernel/power/power.h6
-rw-r--r--kernel/power/process.c6
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/power/snapshot.c35
-rw-r--r--kernel/power/suspend.c190
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/power/swap.c131
-rw-r--r--kernel/power/wakelock.c1
-rw-r--r--kernel/printk/braille.c1
-rw-r--r--kernel/printk/braille.h1
-rw-r--r--kernel/printk/console_cmdline.h1
-rw-r--r--kernel/printk/printk.c72
-rw-r--r--kernel/ptrace.c6
-rw-r--r--kernel/range.c1
-rw-r--r--kernel/rcu/Kconfig3
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcu.h149
-rw-r--r--kernel/rcu/rcu_segcblist.c109
-rw-r--r--kernel/rcu/rcu_segcblist.h28
-rw-r--r--kernel/rcu/rcuperf.c17
-rw-r--r--kernel/rcu/rcutorture.c111
-rw-r--r--kernel/rcu/srcutiny.c8
-rw-r--r--kernel/rcu/srcutree.c52
-rw-r--r--kernel/rcu/sync.c9
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tiny_plugin.h47
-rw-r--r--kernel/rcu/tree.c414
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h266
-rw-r--r--kernel/rcu/update.c46
-rw-r--r--kernel/resource.c76
-rw-r--r--kernel/sched/Makefile3
-rw-r--r--kernel/sched/autogroup.c4
-rw-r--r--kernel/sched/autogroup.h1
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/completion.c31
-rw-r--r--kernel/sched/core.c160
-rw-r--r--kernel/sched/cpuacct.c1
-rw-r--r--kernel/sched/cpuacct.h1
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpudeadline.h1
-rw-r--r--kernel/sched/cpufreq_schedutil.c100
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/cpupri.h1
-rw-r--r--kernel/sched/cputime.c3
-rw-r--r--kernel/sched/deadline.c107
-rw-r--r--kernel/sched/debug.c106
-rw-r--r--kernel/sched/fair.c1513
-rw-r--r--kernel/sched/features.h4
-rw-r--r--kernel/sched/idle.c12
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/isolation.c155
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c178
-rw-r--r--kernel/sched/rt.c319
-rw-r--r--kernel/sched/sched-pelt.h1
-rw-r--r--kernel/sched/sched.h101
-rw-r--r--kernel/sched/stats.c1
-rw-r--r--kernel/sched/stats.h1
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/sched/swait.c7
-rw-r--r--kernel/sched/topology.c91
-rw-r--r--kernel/sched/wait.c85
-rw-r--r--kernel/seccomp.c347
-rw-r--r--kernel/signal.c72
-rw-r--r--kernel/smp.c36
-rw-r--r--kernel/smpboot.c25
-rw-r--r--kernel/smpboot.h1
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/sys.c21
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c32
-rw-r--r--kernel/sysctl_binary.c22
-rw-r--r--kernel/task_work.c11
-rw-r--r--kernel/test_kprobes.c29
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c17
-rw-r--r--kernel/time/clockevents.c21
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/itimer.c1
-rw-r--r--kernel/time/ntp.c228
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-cpu-timers.c21
-rw-r--r--kernel/time/posix-stubs.c20
-rw-r--r--kernel/time/posix-timers.h1
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c1
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c1
-rw-r--r--kernel/time/tick-sched.c38
-rw-r--r--kernel/time/tick-sched.h1
-rw-r--r--kernel/time/time.c71
-rw-r--r--kernel/time/timekeeping.c186
-rw-r--r--kernel/time/timekeeping.h3
-rw-r--r--kernel/time/timekeeping_debug.c5
-rw-r--r--kernel/time/timekeeping_internal.h1
-rw-r--r--kernel/time/timer.c82
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c369
-rw-r--r--kernel/trace/bpf_trace.c177
-rw-r--r--kernel/trace/ftrace.c82
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/rpm-traces.c1
-rw-r--r--kernel/trace/trace.c32
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_benchmark.c1
-rw-r--r--kernel/trace/trace_benchmark.h1
-rw-r--r--kernel/trace/trace_branch.c1
-rw-r--r--kernel/trace/trace_entries.h1
-rw-r--r--kernel/trace/trace_events.c17
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_filter_test.h1
-rw-r--r--kernel/trace/trace_export.c1
-rw-r--r--kernel/trace/trace_functions.c1
-rw-r--r--kernel/trace/trace_functions_graph.c3
-rw-r--r--kernel/trace/trace_kdb.c1
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_mmiotrace.c2
-rw-r--r--kernel/trace/trace_nop.c1
-rw-r--r--kernel/trace/trace_output.c21
-rw-r--r--kernel/trace/trace_output.h1
-rw-r--r--kernel/trace/trace_sched_switch.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c9
-rw-r--r--kernel/trace/trace_selftest.c3
-rw-r--r--kernel/trace/trace_selftest_dynamic.c1
-rw-r--r--kernel/trace/trace_stack.c18
-rw-r--r--kernel/trace/trace_stat.c1
-rw-r--r--kernel/trace/trace_stat.h1
-rw-r--r--kernel/trace/trace_syscalls.c58
-rw-r--r--kernel/trace/trace_uprobe.c3
-rw-r--r--kernel/trace/tracing_map.h1
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/umh.c568
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/user_namespace.c22
-rw-r--r--kernel/watchdog.c657
-rw-r--r--kernel/watchdog_hld.c204
-rw-r--r--kernel/workqueue.c132
-rw-r--r--kernel/workqueue_internal.h4
278 files changed, 16814 insertions, 7168 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..172d151d429c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
@@ -5,12 +6,13 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
@@ -108,7 +110,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 5b1284370367..d15c0ee4d955 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/acct.c
*
@@ -146,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
again:
smp_rmb();
rcu_read_lock();
- res = to_acct(ACCESS_ONCE(ns->bacct));
+ res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
@@ -158,7 +159,7 @@ again:
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
@@ -516,7 +517,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ __kernel_write(file, &ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
diff --git a/kernel/audit.h b/kernel/audit.h
index 6bdaf6bd377e..af5bc59487ed 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -182,7 +182,7 @@ struct audit_context {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
- struct timespec abs_timeout;
+ struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 011d46e5f73f..fd353120e0d9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "audit.h"
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
@@ -1007,7 +1008,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(atomic_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c9bb29e17335..e80459f7e132 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
- "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ "abs_timeout_sec=%lld abs_timeout_nsec=%ld",
context->mq_sendrecv.mqdes,
context->mq_sendrecv.msg_len,
context->mq_sendrecv.msg_prio,
- context->mq_sendrecv.abs_timeout.tv_sec,
+ (long long) context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
break;
case AUDIT_MQ_NOTIFY:
@@ -2106,15 +2106,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
*
*/
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
- const struct timespec *abs_timeout)
+ const struct timespec64 *abs_timeout)
{
struct audit_context *context = current->audit_context;
- struct timespec *p = &context->mq_sendrecv.abs_timeout;
+ struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
- memcpy(p, abs_timeout, sizeof(struct timespec));
+ memcpy(p, abs_timeout, sizeof(*p));
else
- memset(p, 0, sizeof(struct timespec));
+ memset(p, 0, sizeof(*p));
context->mq_sendrecv.mqdes = mqdes;
context->mq_sendrecv.msg_len = msg_len;
@@ -2413,6 +2413,12 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
+void __audit_fanotify(unsigned int response)
+{
+ audit_log(current->audit_context, GFP_KERNEL,
+ AUDIT_FANOTIFY, "resp=%u", response);
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
diff --git a/kernel/bounds.c b/kernel/bounds.c
index e1d1d1952bfa..c373e887c066 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e658f2db..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
+ifeq ($(CONFIG_STREAM_PARSER),y)
+obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
+endif
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a3872500..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
#include "map_in_map.h"
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -49,13 +52,16 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_array *array;
u64 array_size;
u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags)
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+ (percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size);
+ array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
@@ -87,6 +93,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
array->map.map_flags = attr->map_flags;
+ array->map.numa_node = numa_node;
array->elem_size = elem_size;
if (!percpu)
@@ -95,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
if (array_size >= U32_MAX - PAGE_SIZE ||
- elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+ bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -489,7 +496,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
@@ -603,6 +610,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ u32 elem_size = round_up(map->value_size, 8);
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+
+ return insn - insn_buf;
+}
+
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
@@ -612,4 +644,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = array_of_map_gen_lookup,
};
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 5c35a98d02bf..7d4f89b7cb84 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
/* ref is an approximation on access frequency. It does not
* have to be very accurate. Hence, no protection is used.
*/
- node->ref = 1;
+ if (!node->ref)
+ node->ref = 1;
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)
{
unsigned int type;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
- struct bpf_prog *prog = cgrp->bpf.prog[type];
-
- if (prog) {
- bpf_prog_put(prog);
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
}
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ rcu_dereference_protected(progs, 1)->
+ progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
}
/**
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
*/
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
{
- unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
- struct bpf_prog *e;
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
- e = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- rcu_assign_pointer(cgrp->bpf.effective[type], e);
- cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
- }
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
}
+#define BPF_CGROUP_MAX_PROGS 64
+
/**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
- struct bpf_prog *prog, enum bpf_attach_type type,
- bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct bpf_prog *old_prog, *effective = NULL;
- struct cgroup_subsys_state *pos;
- bool overridable = true;
-
- if (parent) {
- overridable = !parent->bpf.disallow_override[type];
- effective = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- }
-
- if (prog && effective && !overridable)
- /* if parent has non-overridable prog attached, disallow
- * attaching new programs to descendent cgroup
- */
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
return -EPERM;
- if (prog && effective && overridable != new_overridable)
- /* if parent has overridable prog attached, only
- * allow overridable programs in descendent cgroup
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- old_prog = cgrp->bpf.prog[type];
-
- if (prog) {
- overridable = new_overridable;
- effective = prog;
- if (old_prog &&
- cgrp->bpf.disallow_override[type] == new_overridable)
- /* disallow attaching non-overridable on top
- * of existing overridable in this cgroup
- * and vice versa
- */
- return -EPERM;
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
}
- if (!prog && !old_prog)
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
+ cgrp->bpf.flags[type] = flags;
- cgrp->bpf.prog[type] = prog;
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
- css_for_each_descendant_pre(pos, &cgrp->self) {
- struct cgroup *desc = container_of(pos, struct cgroup, self);
-
- /* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp) {
- pos = css_rightmost_descendant(pos);
- } else {
- rcu_assign_pointer(desc->bpf.effective[type],
- effective);
- desc->bpf.disallow_override[type] = !overridable;
- }
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
}
- if (prog)
- static_branch_inc(&cgroup_bpf_enabled_key);
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ static_branch_inc(&cgroup_bpf_enabled_key);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ int cnt, ret = 0, i;
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ else
+ cnt = prog_list_length(progs);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+ if (attr->query.prog_cnt < cnt) {
+ cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+ prog_ids, cnt);
+ } else {
+ struct bpf_prog_list *pl;
+ u32 id;
+
+ i = 0;
+ list_for_each_entry(pl, progs, node) {
+ id = pl->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+ }
+ return ret;
}
/**
@@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
- struct bpf_prog *prog;
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
struct cgroup *cgrp;
- int ret = 0;
+ int ret;
if (!sk || !sk_fullsock(sk))
return 0;
- if (sk->sk_family != AF_INET &&
- sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog) {
- unsigned int offset = skb->data - skb_network_header(skb);
- struct sock *save_sk = skb->sk;
-
- skb->sk = sk;
- __skb_push(skb, offset);
- ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
- __skb_pull(skb, offset);
- skb->sk = save_sk;
- }
-
- rcu_read_unlock();
-
- return ret;
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+ short access, enum bpf_attach_type type)
+{
+ struct cgroup *cgrp;
+ struct bpf_cgroup_dev_ctx ctx = {
+ .access_type = (access << 16) | dev_type,
+ .major = major,
+ .minor = minor,
+ };
+ int allow = 1;
rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+ BPF_PROG_RUN);
+ rcu_read_unlock();
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+ return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
- rcu_read_unlock();
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
+ default:
+ return NULL;
+ }
+}
- return ret;
+static bool cgroup_dev_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+ .get_func_proto = cgroup_dev_func_proto,
+ .is_valid_access = cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad5f55922a13..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
+ const char *end = sym + KSYM_NAME_LEN;
+
BUILD_BUG_ON(sizeof("bpf_prog_") +
- sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
- *sym = 0;
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
}
static __always_inline unsigned long
@@ -595,9 +608,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -833,12 +850,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -1010,7 +1035,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -1073,6 +1098,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -1085,6 +1122,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -1097,6 +1146,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -1109,6 +1170,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -1307,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- fp = bpf_int_jit_compile(fp);
+ if (!bpf_prog_is_dev_bound(fp->aux)) {
+ fp = bpf_int_jit_compile(fp);
+ } else {
+ *err = bpf_prog_offload_compile(fp);
+ if (*err)
+ return fp;
+ }
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -1321,11 +1400,163 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+ struct bpf_prog **prog;
+ u32 cnt = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++)
+ cnt++;
+ rcu_read_unlock();
+ return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ struct bpf_prog **prog;
+ u32 i = 0, id;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++) {
+ id = (*prog)->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+ rcu_read_unlock();
+ return -EFAULT;
+ }
+ if (++i == cnt) {
+ prog++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (*prog)
+ return -ENOSPC;
+ return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
aux = container_of(work, struct bpf_prog_aux, work);
+ if (bpf_prog_is_dev_bound(aux))
+ bpf_prog_offload_destroy(aux->prog);
bpf_jit_free(aux->prog);
}
@@ -1378,6 +1609,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -1437,5 +1669,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..ce5b669003b2
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,706 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+ u32 qsize; /* Queue size placeholder for map lookup */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+ struct work_struct kthread_stop_wq;
+
+ atomic_t refcnt; /* Control when this struct can be free'ed */
+ struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry **cpu_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_cpu_map *cmap;
+ int err = -ENOMEM;
+ u64 cost;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ cmap->map.map_type = attr->map_type;
+ cmap->map.key_size = attr->key_size;
+ cmap->map.value_size = attr->value_size;
+ cmap->map.max_entries = attr->max_entries;
+ cmap->map.map_flags = attr->map_flags;
+ cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (cmap->map.max_entries > NR_CPUS) {
+ err = -E2BIG;
+ goto free_cmap;
+ }
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+ cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_cmap;
+ cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ ret = bpf_map_precharge_memlock(cmap->map.pages);
+ if (ret) {
+ err = ret;
+ goto free_cmap;
+ }
+
+ /* A per cpu bitfield with a bit per possible CPU in map */
+ cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!cmap->flush_needed)
+ goto free_cmap;
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map)
+ goto free_percpu;
+
+ return &cmap->map;
+free_percpu:
+ free_percpu(cmap->flush_needed);
+free_cmap:
+ kfree(cmap);
+ return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ if (WARN_ON_ONCE(ptr))
+ page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+ /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+ * as it waits until all in-flight call_rcu() callbacks complete.
+ */
+ rcu_barrier();
+
+ /* kthread_stop will wake_up_process and wait for it to complete */
+ kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+ struct xdp_pkt *xdp_pkt;
+ int metasize;
+ int headroom;
+
+ /* Assure headroom is available for storing info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_pkt = xdp->data_hard_start;
+
+ xdp_pkt->data = xdp->data;
+ xdp_pkt->len = xdp->data_end - xdp->data;
+ xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+ xdp_pkt->metasize = metasize;
+
+ return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
+{
+ unsigned int frame_size;
+ void *pkt_data_start;
+ struct sk_buff *skb;
+
+ /* build_skb need to place skb_shared_info after SKB end, and
+ * also want to know the memory "truesize". Thus, need to
+ * know the memory frame size backing xdp_buff.
+ *
+ * XDP was designed to have PAGE_SIZE frames, but this
+ * assumption is not longer true with ixgbe and i40e. It
+ * would be preferred to set frame_size to 2048 or 4096
+ * depending on the driver.
+ * frame_size = 2048;
+ * frame_len = frame_size - sizeof(*xdp_pkt);
+ *
+ * Instead, with info avail, skb_shared_info in placed after
+ * packet len. This, unfortunately fakes the truesize.
+ * Another disadvantage of this approach, the skb_shared_info
+ * is not at a fixed memory location, with mixed length
+ * packets, which is bad for cache-line hotness.
+ */
+ frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ skb = build_skb(pkt_data_start, frame_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, xdp_pkt->headroom);
+ __skb_put(skb, xdp_pkt->len);
+ if (xdp_pkt->metasize)
+ skb_metadata_set(skb, xdp_pkt->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When kthread gives stop order, then rcpu have been disconnected
+ * from map, thus no new packets can enter. Remaining in-flight
+ * per CPU stored packets are flushed to this queue. Wait honoring
+ * kthread_stop signal until queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ unsigned int processed = 0, drops = 0, sched = 0;
+ struct xdp_pkt *xdp_pkt;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Recheck to avoid lost wake-up */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ schedule();
+ sched = 1;
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+ } else {
+ sched = cond_resched();
+ }
+
+ /* Process packets in rcpu->queue */
+ local_bh_disable();
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread CPU pinned. Lockless access to ptr_ring
+ * consume side valid as no-resize allowed of queue.
+ */
+ while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ struct sk_buff *skb;
+ int ret;
+
+ skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ if (!skb) {
+ page_frag_free(xdp_pkt);
+ continue;
+ }
+
+ /* Inject into network stack */
+ ret = netif_receive_skb_core(skb);
+ if (ret == NET_RX_DROP)
+ drops++;
+
+ /* Limit BH-disable period */
+ if (++processed == 8)
+ break;
+ }
+ /* Feedback loop via tracepoint */
+ trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ put_cpu_map_entry(rcpu);
+ return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+ gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ int numa, err;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ /* Alloc queue */
+ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map_id;
+ rcpu->qsize = qsize;
+
+ /* Setup kthread */
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu, map_id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_ptr_ring;
+
+ get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ return rcpu;
+
+free_ptr_ring:
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_cpu_map_entry *rcpu;
+ int cpu;
+
+ /* This cpu_map_entry have been disconnected from map and one
+ * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+ /* Flush remaining packets in percpu bulkq */
+ for_each_online_cpu(cpu) {
+ struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+ /* No concurrent bq_enqueue can run at this point */
+ bq_flush_to_queue(rcpu, bq);
+ }
+ free_percpu(rcpu->bulkq);
+ /* Cannot kthread_stop() here, last put free rcpu resources */
+ put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue. Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ if (old_rcpu) {
+ call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+ INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+ schedule_work(&old_rcpu->kthread_stop_wq);
+ }
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() use preempt_disable() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+ /* Value is the queue size */
+ u32 qsize = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (!cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ int cpu;
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+ * It does __not__ ensure pending flush operations (if any) are
+ * complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ cond_resched();
+ }
+
+ /* For cpu_map the remote CPUs can still be using the entries
+ * (struct bpf_cpu_map_entry).
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = READ_ONCE(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* bq flush and cleanup happens after RCU graze-period */
+ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ }
+ free_percpu(cmap->flush_needed);
+ bpf_map_area_free(cmap->cpu_map);
+ kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = READ_ONCE(cmap->cpu_map[key]);
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ void *xdp_pkt = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdp_pkt);
+ if (err) {
+ drops++;
+ page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+ return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(rcpu, bq);
+
+ /* Notice, xdp_buff/page MUST be queued here, long enough for
+ * driver to code invoking us to finished, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, incoming xdp_pkt is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as driver will invoke flush
+ * operation, when completing napi->poll call.
+ */
+ bq->q[bq->count++] = xdp_pkt;
+ return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct xdp_pkt *xdp_pkt;
+
+ xdp_pkt = convert_to_xdp_pkt(xdp);
+ if (unlikely(!xdp_pkt))
+ return -EOVERFLOW;
+
+ /* Info needed when constructing SKB on remote CPU */
+ xdp_pkt->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdp_pkt);
+ return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+ u32 bit;
+
+ /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+ * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+ * bitmap indicate which percpu bulkq have packets.
+ */
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+ struct xdp_bulk_queue *bq;
+
+ /* This is possible if entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!rcpu))
+ continue;
+
+ __clear_bit(bit, bitmap);
+
+ /* Flush all frames in bulkq to real queue */
+ bq = this_cpu_ptr(rcpu->bulkq);
+ bq_flush_to_queue(rcpu, bq);
+
+ /* If already running, costs spin_lock_irqsave + smb_mb */
+ wake_up_process(rcpu->kthread);
+ }
+}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..ebdef54bf7df
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,418 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_dtab_netdev {
+ struct net_device *dev;
+ struct bpf_dtab *dtab;
+ unsigned int bit;
+ struct rcu_head rcu;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev **netdev_map;
+ unsigned long __percpu *flush_needed;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ int err = -EINVAL;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ dtab->map.map_type = attr->map_type;
+ dtab->map.key_size = attr->key_size;
+ dtab->map.value_size = attr->value_size;
+ dtab->map.max_entries = attr->max_entries;
+ dtab->map.map_flags = attr->map_flags;
+ dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_dtab;
+
+ dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ goto free_dtab;
+
+ err = -ENOMEM;
+
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ goto free_dtab;
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
+free_dtab:
+ free_percpu(dtab->flush_needed);
+ kfree(dtab);
+ return ERR_PTR(err);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cond_resched();
+ }
+
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
+
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ free_percpu(dtab->flush_needed);
+ bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ __clear_bit(bit, bitmap);
+ netdev = dev->dev;
+ if (likely(netdev->netdev_ops->ndo_xdp_flush))
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put wont happen until after reading the
+ * ifindex.
+ */
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
+}
+
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+{
+ if (dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+ __clear_bit(dev->bit, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ }
+ }
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(dev);
+ dev_put(dev->dev);
+ kfree(dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ /* Use call_rcu() here to ensure any rcu critical sections have
+ * completed, but this does not guarantee a flush has happened
+ * yet. Because driver side rcu_read_lock/unlock only protects the
+ * running XDP program. However, for pending flush operations the
+ * dev and ctx are stored in another per cpu map. And additionally,
+ * the driver tear down ensures all soft irqs are complete before
+ * removing the net device in the case of dev_put equals zero.
+ */
+ old_dev = xchg(&dtab->netdev_map[k], NULL);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 i = *(u32 *)key;
+ u32 ifindex = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ if (!ifindex) {
+ dev = NULL;
+ } else {
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ map->numa_node);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return -EINVAL;
+ }
+
+ dev->bit = i;
+ dev->dtab = dtab;
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed
+ * Remembering the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = xchg(&dtab->netdev_map[i], dev);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+};
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ if (!dev ||
+ dev->dev->ifindex != netdev->ifindex)
+ continue;
+ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ if (dev == odev)
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ register_netdevice_notifier(&dev_map_notifier);
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(verbose, env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose(env, "BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose(env, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(env, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(env, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+ const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d11c8181f4c5..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,10 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -138,7 +142,8 @@ static int prealloc_init(struct bpf_htab *htab)
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -233,6 +238,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_htab *htab;
int err, i;
u64 cost;
@@ -248,7 +254,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
@@ -258,6 +264,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (lru && !prealloc)
return ERR_PTR(-ENOTSUPP);
+ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+ return ERR_PTR(-EINVAL);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -268,6 +277,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
htab->map.map_flags = attr->map_flags;
+ htab->map.numa_node = numa_node;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -308,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
- /* make sure the size for pcpu_alloc() is reasonable */
- goto free_htab;
-
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)
@@ -346,7 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
- sizeof(struct bucket));
+ sizeof(struct bucket),
+ htab->map.numa_node);
if (!htab->buckets)
goto free_htab;
@@ -504,6 +511,29 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int ref_reg = BPF_REG_1;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
+ *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
+ *insn++ = BPF_ST_MEM(BPF_B, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref),
+ 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -704,7 +734,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!l_new)
return ERR_PTR(-ENOMEM);
}
@@ -1126,6 +1157,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_gen_lookup = htab_lru_map_gen_lookup,
};
/* Called from eBPF program */
@@ -1315,6 +1347,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+
+ return insn - insn_buf;
+}
+
static void htab_of_map_free(struct bpf_map *map)
{
bpf_map_meta_free(map->inner_map_meta);
@@ -1330,4 +1378,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = htab_of_map_gen_lookup,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -326,18 +326,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -363,6 +368,7 @@ out:
putname(pname);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f0f17d..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+ trie->map.numa_node);
if (!node)
return NULL;
@@ -388,10 +389,99 @@ out:
return ret;
}
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
{
- /* TODO */
- return -ENOSYS;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the nodes parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference_protected(
+ *trim, lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ kfree_rcu(parent, rcu);
+ kfree_rcu(node, rcu);
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ kfree_rcu(node, rcu);
+
+out:
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
}
#define LPM_DATA_SIZE_MAX 256
@@ -405,6 +495,9 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
+
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
@@ -416,7 +509,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 ||
- attr->map_flags != BPF_F_NO_PREALLOC ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +527,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
trie->map.map_flags = attr->map_flags;
+ trie->map.numa_node = bpf_map_attr_numa_node(attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..2816feb38be1
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,194 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dev_offload *offload;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (attr->prog_flags)
+ return -EINVAL;
+
+ offload = kzalloc(sizeof(*offload), GFP_USER);
+ if (!offload)
+ return -ENOMEM;
+
+ offload->prog = prog;
+ init_waitqueue_head(&offload->verifier_done);
+
+ rtnl_lock();
+ offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+ if (!offload->netdev) {
+ rtnl_unlock();
+ kfree(offload);
+ return -EINVAL;
+ }
+
+ prog->aux->offload = offload;
+ list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+ rtnl_unlock();
+
+ return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+ struct netdev_bpf *data)
+{
+ struct net_device *netdev = prog->aux->offload->netdev;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return -ENODEV;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ data->command = cmd;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+ struct netdev_bpf data = {};
+ int err;
+
+ data.verifier.prog = env->prog;
+
+ rtnl_lock();
+ err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+ if (err)
+ goto exit_unlock;
+
+ env->dev_ops = data.verifier.ops;
+
+ env->prog->aux->offload->dev_state = true;
+ env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+
+ data.offload.prog = prog;
+
+ if (offload->verifier_running)
+ wait_event(offload->verifier_done, !offload->verifier_running);
+
+ if (offload->dev_state)
+ WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+ offload->dev_state = false;
+ list_del_init(&offload->offloads);
+ offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ __bpf_prog_offload_destroy(prog);
+ rtnl_unlock();
+
+ kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+ int ret;
+
+ data.offload.prog = prog;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+ rtnl_unlock();
+
+ return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ WARN(1, "attempt to execute device eBPF program on the host!");
+ return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+ prog->bpf_func = bpf_prog_warn_on_exec;
+
+ return bpf_prog_offload_translate(prog);
+}
+
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ u32 ifindex;
+
+ rtnl_lock();
+ ifindex = offload->netdev ? offload->netdev->ifindex : 0;
+ rtnl_unlock();
+
+ return ifindex;
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dev_offload *offload, *tmp;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+ offloads) {
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+ .notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+ register_netdevice_notifier(&bpf_offload_notifier);
+ return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
+ unsigned long flags;
int orig_cpu, cpu;
+ local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock(&head->lock);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu)
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
return NULL;
+ }
}
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 000000000000..5ee2e41893d9
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,901 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarly used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
+ *
+ * For reference this program is similar to devmap used in XDP context
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/strparser.h>
+#include <net/tcp.h>
+
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sock_map;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+};
+
+enum smap_psock_state {
+ SMAP_TX_RUNNING,
+};
+
+struct smap_psock_map_entry {
+ struct list_head list;
+ struct sock **entry;
+};
+
+struct smap_psock {
+ struct rcu_head rcu;
+ /* refcnt is used inside sk_callback_lock */
+ u32 refcnt;
+
+ /* datapath variables */
+ struct sk_buff_head rxqueue;
+ bool strp_enabled;
+
+ /* datapath error path cache across tx work invocations */
+ int save_rem;
+ int save_off;
+ struct sk_buff *save_skb;
+
+ struct strparser strp;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+ struct list_head maps;
+
+ /* Back reference used when sock callback trigger sockmap operations */
+ struct sock *sock;
+ unsigned long state;
+
+ struct work_struct tx_work;
+ struct work_struct gc_work;
+
+ void (*save_data_ready)(struct sock *sk);
+ void (*save_write_space)(struct sock *sk);
+ void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+ return rcu_dereference_sk_user_data(sk);
+}
+
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
+enum __sk_action {
+ __SK_DROP = 0,
+ __SK_PASS,
+ __SK_REDIRECT,
+};
+
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
+ int rc;
+
+ if (unlikely(!prog))
+ return __SK_DROP;
+
+ skb_orphan(skb);
+ /* We need to ensure that BPF metadata for maps is also cleared
+ * when we orphan the skb so that we don't have the possibility
+ * to reference a stale map.
+ */
+ TCP_SKB_CB(skb)->bpf.map = NULL;
+ skb->sk = psock->sock;
+ bpf_compute_data_pointers(skb);
+ preempt_disable();
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ preempt_enable();
+ skb->sk = NULL;
+
+ /* Moving return codes from UAPI namespace into internal namespace */
+ return rc == SK_PASS ?
+ (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+ __SK_DROP;
+}
+
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int rc;
+
+ rc = smap_verdict_func(psock, skb);
+ switch (rc) {
+ case __SK_REDIRECT:
+ sk = do_sk_redirect_map(skb);
+ if (likely(sk)) {
+ struct smap_psock *peer = smap_psock_sk(sk);
+
+ if (likely(peer &&
+ test_bit(SMAP_TX_RUNNING, &peer->state) &&
+ !sock_flag(sk, SOCK_DEAD) &&
+ sock_writeable(sk))) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
+ }
+ }
+ /* Fall through and free skb otherwise */
+ case __SK_DROP:
+ default:
+ kfree_skb(skb);
+ }
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+ struct sock *sk = psock->sock;
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+/* Called with lock_sock(sk) held */
+static void smap_state_change(struct sock *sk)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+ struct socket_wq *wq;
+ struct sock *osk;
+
+ rcu_read_lock();
+
+ /* Allowing transitions into an established syn_recv states allows
+ * for early binding sockets to a smap object before the connection
+ * is established.
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ case TCP_LISTEN:
+ break;
+ case TCP_CLOSE:
+ /* Only release if the map entry is in fact the sock in
+ * question. There is a case where the operator deletes
+ * the sock from the map, but the TCP sock is closed before
+ * the psock is detached. Use cmpxchg to verify correct
+ * sock is removed.
+ */
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ write_lock_bh(&sk->sk_callback_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ break;
+ default:
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ smap_report_sk_error(psock, EPIPE);
+ break;
+ }
+
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static void smap_read_sock_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ smap_do_verdict(psock, skb);
+ rcu_read_unlock();
+}
+
+/* Called with lock held on socket */
+static void smap_data_ready(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock)) {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ rcu_read_unlock();
+}
+
+static void smap_tx_work(struct work_struct *w)
+{
+ struct smap_psock *psock;
+ struct sk_buff *skb;
+ int rem, off, n;
+
+ psock = container_of(w, struct smap_psock, tx_work);
+
+ /* lock sock to avoid losing sk_socket at some point during loop */
+ lock_sock(psock->sock);
+ if (psock->save_skb) {
+ skb = psock->save_skb;
+ rem = psock->save_rem;
+ off = psock->save_off;
+ psock->save_skb = NULL;
+ goto start;
+ }
+
+ while ((skb = skb_dequeue(&psock->rxqueue))) {
+ rem = skb->len;
+ off = 0;
+start:
+ do {
+ if (likely(psock->sock->sk_socket))
+ n = skb_send_sock_locked(psock->sock,
+ skb, off, rem);
+ else
+ n = -EINVAL;
+ if (n <= 0) {
+ if (n == -EAGAIN) {
+ /* Retry when space is available */
+ psock->save_skb = skb;
+ psock->save_rem = rem;
+ psock->save_off = off;
+ goto out;
+ }
+ /* Hard errors break pipe and stop xmit */
+ smap_report_sk_error(psock, n ? -n : EPIPE);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ kfree_skb(skb);
+ goto out;
+ }
+ rem -= n;
+ off += n;
+ } while (rem);
+ kfree_skb(skb);
+ }
+out:
+ release_sock(psock->sock);
+}
+
+static void smap_write_space(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
+ schedule_work(&psock->tx_work);
+ rcu_read_unlock();
+}
+
+static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (!psock->strp_enabled)
+ return;
+ sk->sk_data_ready = psock->save_data_ready;
+ sk->sk_write_space = psock->save_write_space;
+ sk->sk_state_change = psock->save_state_change;
+ psock->save_data_ready = NULL;
+ psock->save_write_space = NULL;
+ psock->save_state_change = NULL;
+ strp_stop(&psock->strp);
+ psock->strp_enabled = false;
+}
+
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+ struct smap_psock *psock = container_of(rcu,
+ struct smap_psock, rcu);
+
+ /* Now that a grace period has passed there is no longer
+ * any reference to this sock in the sockmap so we can
+ * destroy the psock, strparser, and bpf programs. But,
+ * because we use workqueue sync operations we can not
+ * do it in rcu context
+ */
+ schedule_work(&psock->gc_work);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
+{
+ psock->refcnt--;
+ if (psock->refcnt)
+ return;
+
+ smap_stop_sock(psock, sock);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ rcu_assign_sk_user_data(sock, NULL);
+ call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+ struct bpf_prog *prog;
+ int rc;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ prog = READ_ONCE(psock->bpf_parse);
+
+ if (unlikely(!prog)) {
+ rcu_read_unlock();
+ return skb->len;
+ }
+
+ /* Attach socket for bpf program to use if needed we can do this
+ * because strparser clones the skb before handing it to a upper
+ * layer, meaning skb_orphan has been called. We NULL sk on the
+ * way out to ensure we don't trigger a BUG_ON in skb/sk operations
+ * later and because we are not charging the memory of this skb to
+ * any socket yet.
+ */
+ skb->sk = psock->sock;
+ bpf_compute_data_pointers(skb);
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ skb->sk = NULL;
+ rcu_read_unlock();
+ return rc;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+ struct sock *sk)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = smap_read_sock_strparser,
+ .parse_msg = smap_parse_func_strparser,
+ .read_sock_done = smap_read_sock_done,
+ };
+
+ return strp_init(&psock->strp, sk, &cb);
+}
+
+static void smap_init_progs(struct smap_psock *psock,
+ struct bpf_stab *stab,
+ struct bpf_prog *verdict,
+ struct bpf_prog *parse)
+{
+ struct bpf_prog *orig_parse, *orig_verdict;
+
+ orig_parse = xchg(&psock->bpf_parse, parse);
+ orig_verdict = xchg(&psock->bpf_verdict, verdict);
+
+ if (orig_verdict)
+ bpf_prog_put(orig_verdict);
+ if (orig_parse)
+ bpf_prog_put(orig_parse);
+}
+
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (sk->sk_data_ready == smap_data_ready)
+ return;
+ psock->save_data_ready = sk->sk_data_ready;
+ psock->save_write_space = sk->sk_write_space;
+ psock->save_state_change = sk->sk_state_change;
+ sk->sk_data_ready = smap_data_ready;
+ sk->sk_write_space = smap_write_space;
+ sk->sk_state_change = smap_state_change;
+ psock->strp_enabled = true;
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{
+ bpf_map_area_free(stab->sock_map);
+ kfree(stab);
+}
+
+static void smap_gc_work(struct work_struct *w)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+
+ psock = container_of(w, struct smap_psock, gc_work);
+
+ /* no callback lock needed because we already detached sockmap ops */
+ if (psock->strp_enabled)
+ strp_done(&psock->strp);
+
+ cancel_work_sync(&psock->tx_work);
+ __skb_queue_purge(&psock->rxqueue);
+
+ /* At this point all strparser and xmit work must be complete */
+ if (psock->bpf_parse)
+ bpf_prog_put(psock->bpf_parse);
+ if (psock->bpf_verdict)
+ bpf_prog_put(psock->bpf_verdict);
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ list_del(&e->list);
+ kfree(e);
+ }
+
+ sock_put(psock->sock);
+ kfree(psock);
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+ struct bpf_stab *stab)
+{
+ struct smap_psock *psock;
+
+ psock = kzalloc_node(sizeof(struct smap_psock),
+ GFP_ATOMIC | __GFP_NOWARN,
+ stab->map.numa_node);
+ if (!psock)
+ return ERR_PTR(-ENOMEM);
+
+ psock->sock = sock;
+ skb_queue_head_init(&psock->rxqueue);
+ INIT_WORK(&psock->tx_work, smap_tx_work);
+ INIT_WORK(&psock->gc_work, smap_gc_work);
+ INIT_LIST_HEAD(&psock->maps);
+ psock->refcnt = 1;
+
+ rcu_assign_sk_user_data(sock, psock);
+ sock_hold(sock);
+ return psock;
+}
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+ int err = -EINVAL;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ stab = kzalloc(sizeof(*stab), GFP_USER);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ stab->map.map_type = attr->map_type;
+ stab->map.key_size = attr->key_size;
+ stab->map.value_size = attr->value_size;
+ stab->map.max_entries = attr->max_entries;
+ stab->map.map_flags = attr->map_flags;
+ stab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_stab;
+
+ stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(stab->map.pages);
+ if (err)
+ goto free_stab;
+
+ err = -ENOMEM;
+ stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (!stab->sock_map)
+ goto free_stab;
+
+ return &stab->map;
+free_stab:
+ kfree(stab);
+ return ERR_PTR(err);
+}
+
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+ struct smap_psock_map_entry *e, *tmp;
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ if (e->entry == entry) {
+ list_del(&e->list);
+ break;
+ }
+ }
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct smap_psock *psock;
+ struct sock *sock;
+
+ sock = xchg(&stab->sock_map[i], NULL);
+ if (!sock)
+ continue;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
+ }
+ rcu_read_unlock();
+
+ if (stab->bpf_verdict)
+ bpf_prog_put(stab->bpf_verdict);
+ if (stab->bpf_parse)
+ bpf_prog_put(stab->bpf_parse);
+
+ sock_map_remove_complete(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (i >= stab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = i + 1;
+ return 0;
+}
+
+struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ return READ_ONCE(stab->sock_map[key]);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock *psock;
+ int k = *(u32 *)key;
+ struct sock *sock;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ sock = xchg(&stab->sock_map[k], NULL);
+ if (!sock)
+ return -EINVAL;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ if (!psock)
+ goto out;
+
+ if (psock->bpf_parse)
+ smap_stop_sock(psock, sock);
+ smap_list_remove(psock, &stab->sock_map[k]);
+ smap_release_sock(psock, sock);
+out:
+ write_unlock_bh(&sock->sk_callback_lock);
+ return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and syncd so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock and the lock is required anyways for modifying
+ * callbacks, we simply increase its scope slightly.
+ *
+ * Rules to follow,
+ * - psock must always be read inside RCU critical section
+ * - sk_user_data must only be modified inside sk_callback_lock and read
+ * inside RCU critical section.
+ * - psock->maps list must only be read & modified inside sk_callback_lock
+ * - sock_map must use READ_ONCE and (cmp)xchg operations
+ * - BPF verdict/parse programs must use READ_ONCE and xchg operations
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock_map_entry *e = NULL;
+ struct bpf_prog *verdict, *parse;
+ struct sock *osock, *sock;
+ struct smap_psock *psock;
+ u32 i = *(u32 *)key;
+ int err;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= stab->map.max_entries))
+ return -E2BIG;
+
+ sock = READ_ONCE(stab->sock_map[i]);
+ if (flags == BPF_EXIST && !sock)
+ return -ENOENT;
+ else if (flags == BPF_NOEXIST && sock)
+ return -EEXIST;
+
+ sock = skops->sk;
+
+ /* 1. If sock map has BPF programs those will be inherited by the
+ * sock being added. If the sock is already attached to BPF programs
+ * this results in an error.
+ */
+ verdict = READ_ONCE(stab->bpf_verdict);
+ parse = READ_ONCE(stab->bpf_parse);
+
+ if (parse && verdict) {
+ /* bpf prog refcnt may be zero if a concurrent attach operation
+ * removes the program after the above READ_ONCE() but before
+ * we increment the refcnt. If this is the case abort with an
+ * error.
+ */
+ verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ if (IS_ERR(verdict))
+ return PTR_ERR(verdict);
+
+ parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ if (IS_ERR(parse)) {
+ bpf_prog_put(verdict);
+ return PTR_ERR(parse);
+ }
+ }
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+
+ /* 2. Do not allow inheriting programs if psock exists and has
+ * already inherited programs. This would create confusion on
+ * which parser/verdict program is running. If no psock exists
+ * create one. Inside sk_callback_lock to ensure concurrent create
+ * doesn't update user data.
+ */
+ if (psock) {
+ if (READ_ONCE(psock->bpf_parse) && parse) {
+ err = -EBUSY;
+ goto out_progs;
+ }
+ psock->refcnt++;
+ } else {
+ psock = smap_init_psock(sock, stab);
+ if (IS_ERR(psock)) {
+ err = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ set_bit(SMAP_TX_RUNNING, &psock->state);
+ }
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
+ e->entry = &stab->sock_map[i];
+
+ /* 3. At this point we have a reference to a valid psock that is
+ * running. Attach any BPF programs needed.
+ */
+ if (parse && verdict && !psock->strp_enabled) {
+ err = smap_init_sock(psock, sock);
+ if (err)
+ goto out_free;
+ smap_init_progs(psock, stab, verdict, parse);
+ smap_start_sock(psock, sock);
+ }
+
+ /* 4. Place psock in sockmap for use and stop any programs on
+ * the old sock assuming its not the same sock we are replacing
+ * it with. Because we can only have a single set of programs if
+ * old_sock has a strp we can stop it.
+ */
+ list_add_tail(&e->list, &psock->maps);
+ write_unlock_bh(&sock->sk_callback_lock);
+
+ osock = xchg(&stab->sock_map[i], sock);
+ if (osock) {
+ struct smap_psock *opsock = smap_psock_sk(osock);
+
+ write_lock_bh(&osock->sk_callback_lock);
+ if (osock != sock && parse)
+ smap_stop_sock(opsock, osock);
+ smap_list_remove(opsock, &stab->sock_map[i]);
+ smap_release_sock(opsock, osock);
+ write_unlock_bh(&osock->sk_callback_lock);
+ }
+ return 0;
+out_free:
+ smap_release_sock(psock, sock);
+out_progs:
+ if (verdict)
+ bpf_prog_put(verdict);
+ if (parse)
+ bpf_prog_put(parse);
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(e);
+ return err;
+}
+
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *orig;
+
+ if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_SK_SKB_STREAM_PARSER:
+ orig = xchg(&stab->bpf_parse, prog);
+ break;
+ case BPF_SK_SKB_STREAM_VERDICT:
+ orig = xchg(&stab->bpf_verdict, prog);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (orig)
+ bpf_prog_put(orig);
+
+ return 0;
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static int sock_map_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ if (skops.sk->sk_type != SOCK_STREAM ||
+ skops.sk->sk_protocol != IPPROTO_TCP) {
+ fput(socket->file);
+ return -EOPNOTSUPP;
+ }
+
+ err = sock_map_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+const struct bpf_map_ops sock_map_ops = {
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+};
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d730abf..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -31,7 +34,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+ smap->map.numa_node);
if (!smap->elems)
return -ENOMEM;
@@ -59,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
@@ -75,7 +79,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = bpf_map_area_alloc(cost);
+ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
@@ -91,6 +95,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6c772adabad2..09badc37e864 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -31,6 +34,8 @@
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
@@ -48,6 +53,47 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#undef BPF_MAP_TYPE
};
+/*
+ * If we're handed a bigger struct than we know of, ensure all the unknown bits
+ * are 0 - i.e. new user-space does not rely on any kernel feature extensions
+ * we don't know about yet.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
+ return -E2BIG;
+
+ if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+ return -EFAULT;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map *map;
@@ -64,7 +110,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
{
/* We definitely need __GFP_NORETRY, so OOM killer doesn't
* trigger under memory pressure as we really just want to
@@ -74,12 +120,13 @@ void *bpf_map_area_alloc(size_t size)
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc(size, GFP_USER | flags);
+ area = kmalloc_node(size, GFP_USER | flags, numa_node);
if (area != NULL)
return area;
}
- return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+ return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+ __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -144,15 +191,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
+ unsigned long flags;
+
if (do_idr_lock)
- spin_lock_bh(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
if (do_idr_lock)
- spin_unlock_bh(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
else
__release(&map_idr_lock);
}
@@ -163,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -247,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -268,34 +355,76 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+ const char *end = src + BPF_OBJ_NAME_LEN;
+
+ memset(dst, 0, BPF_OBJ_NAME_LEN);
+
+ /* Copy all isalnum() and '_' char */
+ while (src < end && *src) {
+ if (!isalnum(*src) && *src != '_')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+ if (src == end)
+ return -EINVAL;
+
+ return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
+ if (numa_node != NUMA_NO_NODE &&
+ ((unsigned int)numa_node >= nr_node_ids ||
+ !node_online(numa_node)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
return PTR_ERR(map);
+ err = bpf_obj_name_cpy(map->name, attr->map_name);
+ if (err)
+ goto free_map_nouncharge;
+
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
@@ -312,6 +441,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -410,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -490,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -512,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
+ /* Need to create a kthread, thus must support schedule */
+ if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ goto out;
+ }
+
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
@@ -542,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
@@ -573,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -616,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -653,9 +810,9 @@ err_put:
return err;
}
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
- [_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
@@ -667,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
return -EINVAL;
- prog->aux->ops = bpf_prog_types[type];
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ prog->aux->ops = bpf_prog_types[type];
+ else
+ prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
return 0;
}
@@ -770,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -817,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -870,7 +1039,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
EXPORT_SYMBOL_GPL(bpf_prog_inc);
/* prog_idr_lock should have been held */
-static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
int refold;
@@ -886,8 +1055,24 @@ static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static bool bpf_prog_can_attach(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type,
+ struct net_device *netdev)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+
+ if (prog->type != *attach_type)
+ return false;
+ if (offload && offload->netdev != netdev)
+ return false;
+
+ return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+ struct net_device *netdev)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -895,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
- if (type && prog->type != *type) {
+ if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -908,12 +1093,12 @@ out:
struct bpf_prog *bpf_prog_get(u32 ufd)
{
- return __bpf_prog_get(ufd, NULL);
+ return __bpf_prog_get(ufd, NULL, NULL);
}
struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL);
if (!IS_ERR(prog))
trace_bpf_prog_get_type(prog);
@@ -921,8 +1106,19 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+ struct net_device *netdev)
+{
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev);
+
+ if (!IS_ERR(prog))
+ trace_bpf_prog_get_type(prog);
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -964,10 +1160,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -981,11 +1181,22 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
+ if (attr->prog_target_ifindex) {
+ err = bpf_prog_offload_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
+
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boot_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+ if (err)
+ goto free_prog;
+
/* run eBPF verifier */
err = bpf_check(&prog, attr);
if (err < 0)
@@ -1020,16 +1231,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1037,16 +1250,55 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
}
#ifdef CONFIG_CGROUP_BPF
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
+{
+ struct bpf_prog *prog = NULL;
+ int ufd = attr->target_fd;
+ struct bpf_map *map;
+ struct fd f;
+ int err;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (attach) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_SK_SKB);
+ if (IS_ERR(prog)) {
+ fdput(f);
+ return PTR_ERR(prog);
+ }
+ }
+
+ err = sock_map_prog(map, prog, attr->attach_type);
+ if (err) {
+ fdput(f);
+ if (prog)
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ fdput(f);
+ return 0;
+}
+
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1060,7 +1312,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
switch (attr->attach_type) {
@@ -1074,6 +1326,12 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+ break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ return sockmap_get_from_fd(attr, true);
default:
return -EINVAL;
}
@@ -1088,8 +1346,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
if (ret)
bpf_prog_put(prog);
cgroup_put(cgrp);
@@ -1101,6 +1359,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
@@ -1113,23 +1373,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
-
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+ break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ return sockmap_get_from_fd(attr, false);
default:
return -EINVAL;
}
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
return ret;
}
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_DEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_put(cgrp);
+ return ret;
+}
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1214,20 +1522,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
+ int f_flags;
int fd;
- if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
@@ -1239,39 +1553,13 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- fd = bpf_map_new_fd(map);
+ fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put(map);
return fd;
}
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
-{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
-
- if (actual_size <= expected_size)
- return 0;
-
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
-}
-
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1293,8 +1581,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.type = prog->type;
info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i]))
+ return -EFAULT;
+ }
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
@@ -1320,6 +1625,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ info.status |= BPF_PROG_STATUS_DEV_BOUND;
+ info.ifindex = bpf_prog_offload_ifindex(prog);
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1348,6 +1658,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ memcpy(info.name, map->name, sizeof(map->name));
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1393,17 +1704,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
- return -EFAULT;
-
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
-
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
err = check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
@@ -1413,6 +1713,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -1445,6 +1749,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 000000000000..1f4bf68c12db
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,180 @@
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+struct tnum tnum_const(u64 value)
+{
+ return TNUM(value, 0);
+}
+
+struct tnum tnum_range(u64 min, u64 max)
+{
+ u64 chi = min ^ max, delta;
+ u8 bits = fls64(chi);
+
+ /* special case, needed because 1ULL << 64 is undefined */
+ if (bits > 63)
+ return tnum_unknown;
+ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
+ * constant min (since min == max).
+ */
+ delta = (1ULL << bits) - 1;
+ return TNUM(min & ~delta, delta);
+}
+
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value << shift, a.mask << shift);
+}
+
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value >> shift, a.mask >> shift);
+}
+
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+ u64 sm, sv, sigma, chi, mu;
+
+ sm = a.mask + b.mask;
+ sv = a.value + b.value;
+ sigma = sm + sv;
+ chi = sigma ^ sv;
+ mu = chi | a.mask | b.mask;
+ return TNUM(sv & ~mu, mu);
+}
+
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+ u64 dv, alpha, beta, chi, mu;
+
+ dv = a.value - b.value;
+ alpha = dv + a.mask;
+ beta = dv - b.mask;
+ chi = alpha ^ beta;
+ mu = chi | a.mask | b.mask;
+ return TNUM(dv & ~mu, mu);
+}
+
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+ u64 alpha, beta, v;
+
+ alpha = a.value | a.mask;
+ beta = b.value | b.mask;
+ v = a.value & b.value;
+ return TNUM(v, alpha & beta & ~v);
+}
+
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v, mu & ~v);
+}
+
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value ^ b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* half-multiply add: acc += (unknown * mask * value).
+ * An intermediate step in the multiply algorithm.
+ */
+static struct tnum hma(struct tnum acc, u64 value, u64 mask)
+{
+ while (mask) {
+ if (mask & 1)
+ acc = tnum_add(acc, TNUM(0, value));
+ mask >>= 1;
+ value <<= 1;
+ }
+ return acc;
+}
+
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+ struct tnum acc;
+ u64 pi;
+
+ pi = a.value * b.value;
+ acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
+ return hma(acc, b.mask, a.value);
+}
+
+/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
+ * a 'known 0' - this will return a 'known 1' for that bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask & b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+ a.value &= (1ULL << (size * 8)) - 1;
+ a.mask &= (1ULL << (size * 8)) - 1;
+ return a;
+}
+
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+ if (!size)
+ return true;
+ return !((a.value | a.mask) & (size - 1));
+}
+
+bool tnum_in(struct tnum a, struct tnum b)
+{
+ if (b.mask & ~a.mask)
+ return false;
+ b.value &= ~a.mask;
+ return a.value == b.value;
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+ size_t n;
+
+ for (n = 64; n; n--) {
+ if (n < size) {
+ if (a.mask & 1)
+ str[n - 1] = 'x';
+ else if (a.value & 1)
+ str[n - 1] = '1';
+ else
+ str[n - 1] = '0';
+ }
+ a.mask >>= 1;
+ a.value >>= 1;
+ }
+ str[min(size - 1, (size_t)64)] = 0;
+ return 64;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 664d93972373..dd54d20ace2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -61,12 +72,12 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
* types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
@@ -140,7 +151,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -153,63 +164,60 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
+ unsigned int n;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
va_end(args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
+
+ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+ log->len_used += n;
+ else
+ log->ubuf = NULL;
+}
+
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
}
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
+ [SCALAR_VALUE] = "inv",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
- [FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
[PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
- __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
- BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
- if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
- return func_id_str[id];
- else
- return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -220,241 +228,196 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%lld", reg->imm);
- else if (t == PTR_TO_PACKET)
- verbose("(id=%d,off=%d,r=%d)",
- reg->id, reg->off, reg->range);
- else if (t == UNKNOWN_VALUE && reg->imm)
- verbose("%lld", reg->imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL ||
- t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d,id=%u)",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size,
- reg->id);
- if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%lld",
- (long long)reg->min_value);
- if (reg->max_value != BPF_REGISTER_MAX_RANGE)
- verbose(",max_value=%llu",
- (unsigned long long)reg->max_value);
- if (reg->min_align)
- verbose(",min_align=%u", reg->min_align);
- if (reg->aux_off)
- verbose(",aux_off=%u", reg->aux_off);
- if (reg->aux_off_align)
- verbose(",aux_off_align=%u", reg->aux_off_align);
- }
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
- }
- verbose("\n");
-}
-
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+ tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose(env, "%lld", reg->var_off.value + reg->off);
+ } else {
+ verbose(env, "(id=%d", reg->id);
+ if (t != SCALAR_VALUE)
+ verbose(env, ",off=%d", reg->off);
+ if (type_is_pkt_pointer(t))
+ verbose(env, ",r=%d", reg->range);
+ else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose(env, ",ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ if (tnum_is_const(reg->var_off)) {
+ /* Typically an immediate SCALAR_VALUE, but
+ * could be a pointer whose offset is too big
+ * for reg->off
+ */
+ verbose(env, ",imm=%llx", reg->var_off.value);
+ } else {
+ if (reg->smin_value != reg->umin_value &&
+ reg->smin_value != S64_MIN)
+ verbose(env, ",smin_value=%lld",
+ (long long)reg->smin_value);
+ if (reg->smax_value != reg->umax_value &&
+ reg->smax_value != S64_MAX)
+ verbose(env, ",smax_value=%lld",
+ (long long)reg->smax_value);
+ if (reg->umin_value != 0)
+ verbose(env, ",umin_value=%llu",
+ (unsigned long long)reg->umin_value);
+ if (reg->umax_value != U64_MAX)
+ verbose(env, ",umax_value=%llu",
+ (unsigned long long)reg->umax_value);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, ",var_off=%s", tn_buf);
+ }
+ }
+ verbose(env, ")");
+ }
+ }
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
+ verbose(env, " fp%d=%s",
+ -MAX_BPF_STACK + i * BPF_REG_SIZE,
+ reg_type_str[state->stack[i].spilled_ptr.type]);
+ }
+ verbose(env, "\n");
+}
-static const char *const bpf_jmp_string[16] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JGE >> 4] = ">=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
+static int copy_stack_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ if (!src->stack)
+ return 0;
+ if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+ /* internal bug, make state invalid to reject the program */
+ memset(dst, 0, sizeof(*dst));
+ return -EFAULT;
+ }
+ memcpy(dst->stack, src->stack,
+ sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+ return 0;
+}
-static void print_bpf_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+ bool copy_old)
{
- u8 class = BPF_CLASS(insn->code);
+ u32 old_size = state->allocated_stack;
+ struct bpf_stack_state *new_stack;
+ int slot = size / BPF_REG_SIZE;
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
- else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM &&
- BPF_SIZE(insn->code) == BPF_DW) {
- /* At this point, we already made sure that the second
- * part of the ldimm64 insn is accessible.
- */
- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ if (size <= old_size || !size) {
+ if (copy_old)
+ return 0;
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ if (!size && old_size) {
+ kfree(state->stack);
+ state->stack = NULL;
+ }
+ return 0;
+ }
+ new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!new_stack)
+ return -ENOMEM;
+ if (copy_old) {
+ if (state->stack)
+ memcpy(new_stack, state->stack,
+ sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+ memset(new_stack + old_size / BPF_REG_SIZE, 0,
+ sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+ }
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ kfree(state->stack);
+ state->stack = new_stack;
+ return 0;
+}
- if (map_ptr && !env->allow_ptr_leaks)
- imm = 0;
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ kfree(state->stack);
+ if (free_self)
+ kfree(state);
+}
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
- } else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
- }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ int err;
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
- } else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
- }
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
- }
+ err = realloc_verifier_state(dst, src->allocated_stack, false);
+ if (err)
+ return err;
+ memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ return copy_stack_state(dst, src);
}
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx)
{
- struct bpf_verifier_stack_elem *elem;
- int insn_idx;
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
+ struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem;
+ int err;
- elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
env->head = elem;
env->stack_size++;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ goto err;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
err:
/* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
return NULL;
}
@@ -463,56 +426,192 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg);
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- BUG_ON(regno >= MAX_BPF_REG);
+ reg->id = 0;
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+}
- memset(&regs[regno], 0, sizeof(regs[regno]));
- regs[regno].type = NOT_INIT;
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
- int i;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_reg_not_init(regs, i);
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
- /* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
}
-static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Attempts to improve min/max values based on var_off information */
+static void __update_reg_bounds(struct bpf_reg_state *reg)
{
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].id = 0;
- regs[regno].imm = 0;
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->smin_value = max_t(s64, reg->smin_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->smax_value = min_t(s64, reg->smax_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MAX));
+ reg->umin_value = max(reg->umin_value, reg->var_off.value);
+ reg->umax_value = min(reg->umax_value,
+ reg->var_off.value | reg->var_off.mask);
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Uses signed min/max values to inform unsigned, and vice-versa */
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
- BUG_ON(regno >= MAX_BPF_REG);
- __mark_reg_unknown_value(regs, regno);
+ /* Learn sign from signed bounds.
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if (reg->smin_value >= 0 || reg->smax_value < 0) {
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ return;
+ }
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
+ * boundary, so we must be careful.
+ */
+ if ((s64)reg->umax_value >= 0) {
+ /* Positive. We can't learn anything from the smin, but smax
+ * is positive, hence safe.
+ */
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ } else if ((s64)reg->umin_value < 0) {
+ /* Negative. We can't learn anything from the smax, but smin
+ * is negative, hence safe.
+ */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value;
+ }
}
-static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+/* Attempts to improve var_off based on unsigned min/max information */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
{
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
- regs[regno].value_from_signed = false;
- regs[regno].min_align = 0;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->umin_value,
+ reg->umax_value));
}
-static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
- u32 regno)
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
- mark_reg_unknown_value(regs, regno);
- reset_reg_range_values(regs, regno);
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
+}
+
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown(struct bpf_reg_state *reg)
+{
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->off = 0;
+ reg->var_off = tnum_unknown;
+ __mark_reg_unbounded(reg);
+}
+
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
+{
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_unknown(regs + regno);
+}
+
+static void __mark_reg_not_init(struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown(reg);
+ reg->type = NOT_INIT;
+}
+
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
+{
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_not_init(regs + regno);
+}
+
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ mark_reg_not_init(env, regs, i);
+ regs[i].live = REG_LIVE_NONE;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -521,28 +620,51 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ if (regno == BPF_REG_FP)
+ /* We don't need to worry about FP liveness because it's read-only */
+ return;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->regs[regno].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->regs[regno].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
+ struct bpf_reg_state *regs = env->cur_state->regs;
+
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
+ mark_reg_read(env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
+ regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
@@ -552,12 +674,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
- case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
- case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -568,138 +689,178 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+ err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
+ if (err)
+ return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ state->stack[spi].slot_type[0] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
if (value_regno >= 0 &&
is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
+ state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ state->stack[spi].slot_type[i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct bpf_reg_state) {};
+ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+ STACK_MISC;
}
return 0;
}
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
- u8 *slot_type;
- int i;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ u8 *stype;
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ if (state->allocated_stack <= slot) {
+ verbose(env, "invalid read from stack off %d+0 size %d\n",
+ off, size);
+ return -EACCES;
+ }
+ stype = state->stack[spi].slot_type;
- if (slot_type[0] == STACK_SPILL) {
+ if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
- if (value_regno >= 0)
+ if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ state->regs[value_regno] = state->stack[spi].spilled_ptr;
+ mark_stack_slot_read(state, spi);
+ }
return 0;
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size, bool zero_size_allowed)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
- if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ off + size > map->value_size) {
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
return 0;
}
-/* check read/write into an adjusted map element */
-static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We adjusted the register to this map value, so we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
+ /* We may have adjusted the register to this map value, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
- env->varlen_map_value_access = true;
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = check_map_access(env, regno, reg->min_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size,
+ zero_size_allowed);
if (err) {
- verbose("R%d min value is outside of the array range\n",
+ verbose(env, "R%d min value is outside of the array range\n",
regno);
return err;
}
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
*/
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- return check_map_access(env, regno, reg->max_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size,
+ zero_size_allowed);
+ if (err)
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
+ return err;
}
#define MAX_PACKET_OFF 0xffff
@@ -719,6 +880,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_SK_SKB:
if (meta)
return meta->pkt_access;
@@ -729,22 +891,50 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- off += reg->off;
- if (off < 0 || size <= 0 || off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ (u64)off + size > reg->range) {
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
return 0;
}
-/* check access to 'struct bpf_context' fields */
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size, bool zero_size_allowed)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ int err;
+
+ /* We may have added a variable offset to the packet pointer; but any
+ * reg->range we have comes after that. We are only checking the fixed
+ * offset.
+ */
+
+ /* We don't allow negative numbers, because we aren't tracking enough
+ * detail to prove they're safe.
+ */
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_packet_access(env, regno, off, size, zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d offset is outside of the packet\n", regno);
+ return err;
+ }
+ return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
@@ -752,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
.reg_type = *reg_type,
};
- /* for analyzer ctx accesses are already validated and converted */
- if (env->analyzer_ops)
- return 0;
-
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -765,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -784,40 +970,25 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
if (allow_ptr_leaks)
return false;
- switch (reg->type) {
- case UNKNOWN_VALUE:
- case CONST_IMM:
- return false;
- default:
- return true;
- }
+ return reg->type != SCALAR_VALUE;
}
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
- return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+ return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
+ struct tnum reg_off;
int ip_align;
- int reg_off;
/* Byte size accesses are always allowed. */
if (!strict || size == 1)
return 0;
- reg_off = reg->off;
- if (reg->id) {
- if (reg->aux_off_align % size) {
- verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
- reg->aux_off_align, size);
- return -EACCES;
- }
- reg_off += reg->aux_off;
- }
-
/* For platforms that do not have a Kconfig enabling
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
* NET_IP_ALIGN is universally set to '2'. And on platforms
@@ -827,20 +998,39 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
* unconditional IP align value of '2'.
*/
ip_align = 2;
- if ((ip_align + reg_off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- ip_align, reg_off, off, size);
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
return 0;
}
-static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size, bool strict)
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
{
- if (strict && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -852,21 +1042,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
int off, int size)
{
bool strict = env->strict_alignment;
+ const char *pointer_desc = "";
switch (reg->type) {
case PTR_TO_PACKET:
- return check_pkt_ptr_alignment(reg, off, size, strict);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_val_ptr_alignment(reg, size, strict);
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ break;
default:
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- }
-
- return 0;
+ break;
}
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -879,129 +1077,158 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct bpf_verifier_state *state = &env->cur_state;
- struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
- if (reg->type == PTR_TO_STACK)
- off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
+ /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE ||
- reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (reg->type == PTR_TO_MAP_VALUE_ADJ)
- err = check_map_access_adj(env, regno, off, size);
- else
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+ enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
+ /* ctx accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ */
+ if (reg->off) {
+ verbose(env,
+ "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+ regno, reg->off, off - reg->off);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
return -EACCES;
}
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
- /* note that reg.[id|off|range] == 0 */
- state->regs[value_regno].type = reg_type;
- state->regs[value_regno].aux_off = 0;
- state->regs[value_regno].aux_off_align = 0;
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
+ */
+ if (reg_type == SCALAR_VALUE)
+ mark_reg_unknown(env, regs, value_regno);
+ else
+ mark_reg_known_zero(env, regs,
+ value_regno);
+ regs[value_regno].id = 0;
+ regs[value_regno].off = 0;
+ regs[value_regno].range = 0;
+ regs[value_regno].type = reg_type;
}
- } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+ } else if (reg->type == PTR_TO_STACK) {
+ /* stack accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ * See check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
if (env->prog->aux->stack_depth < -off)
env->prog->aux->stack_depth = -off;
- if (t == BPF_WRITE) {
- if (!env->allow_ptr_leaks &&
- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
- size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
- return -EACCES;
- }
- err = check_stack_write(state, off, size, value_regno);
- } else {
- err = check_stack_read(state, off, size, value_regno);
- }
- } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE)
+ err = check_stack_write(env, state, off, size,
+ value_regno);
+ else
+ err = check_stack_read(env, state, off, size,
+ value_regno);
+ } else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
- err = check_packet_access(env, regno, off, size);
+ err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
- if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
- state->regs[value_regno].type == UNKNOWN_VALUE) {
- /* 1 or 2 byte load zero-extends, determine the number of
- * zero upper bits. Not doing it fo 4 byte load, since
- * such values cannot be added to ptr_to_packet anyway.
- */
- state->regs[value_regno].imm = 64 - size * 8;
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ regs[value_regno].type == SCALAR_VALUE) {
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ regs[value_regno].var_off =
+ tnum_cast(regs[value_regno].var_off, size);
+ __update_reg_bounds(&regs[value_regno]);
}
return err;
}
static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
}
@@ -1016,34 +1243,50 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state reg)
+{
+ return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
+ * and all elements of stack are initialized.
+ * Unlike most pointer bounds-checking functions, this one doesn't take an
+ * 'off' argument, so it has to add in reg->off itself.
*/
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs;
- int off, i;
+ int off, i, slot, spi;
if (regs[regno].type != PTR_TO_STACK) {
+ /* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- regs[regno].type == CONST_IMM &&
- regs[regno].imm == 0)
+ register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
}
- off = regs[regno].imm;
+ /* Only allow fixed-offset stack reads */
+ if (!tnum_is_const(regs[regno].var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
+ regno, tn_buf);
+ }
+ off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1058,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ slot = -(off + i) - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot ||
+ state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+ STACK_MISC) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1071,16 +1318,17 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- switch (regs[regno].type) {
+ switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, 0, access_size);
+ case PTR_TO_PACKET_META:
+ return check_packet_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, 0, access_size);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_map_access_adj(env, regno, 0, access_size);
- default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_map_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
+ default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
}
@@ -1090,44 +1338,42 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
if (arg_type == ARG_DONTCARE)
return 0;
- if (type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
- return -EACCES;
- }
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
}
- if (type == PTR_TO_PACKET &&
+ if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (type != PTR_TO_PACKET && type != expected_type)
+ if (!type_is_pkt_pointer(type) &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
- expected_type = CONST_IMM;
- /* One exception. Allow UNKNOWN_VALUE registers when the
- * boundaries are known and don't cause unsafe memory accesses
- */
- if (type != UNKNOWN_VALUE && type != expected_type)
+ expected_type = SCALAR_VALUE;
+ if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1141,17 +1387,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a CONST_IMM type. Final test
+ * passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (type == CONST_IMM && reg->imm == 0)
+ if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
+ type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1169,12 +1416,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
- meta->map_ptr->key_size);
+ if (type_is_pkt_pointer(type))
+ err = check_packet_access(env, regno, reg->off,
+ meta->map_ptr->key_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->key_size,
@@ -1185,12 +1433,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
- meta->map_ptr->value_size);
+ if (type_is_pkt_pointer(type))
+ err = check_packet_access(env, regno, reg->off,
+ meta->map_ptr->value_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
@@ -1205,14 +1454,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
- /* If the register is UNKNOWN_VALUE, the access check happens
- * using its boundaries. Otherwise, just use its imm
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
*/
- if (type == UNKNOWN_VALUE) {
+
+ if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
* initialize all the memory that the helper could
@@ -1220,45 +1471,39 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
meta = NULL;
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->min_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->max_value,
- zero_size_allowed, meta);
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
if (err)
return err;
- } else {
- /* register is CONST_IMM */
- err = check_helper_mem_access(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
}
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
}
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1271,7 +1516,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1283,10 +1529,30 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ /* devmap returns a pointer to a live net_device ifindex that we cannot
+ * allow to be modified from bpf side. So do not allow lookup elements
+ * for now.
+ */
+ case BPF_MAP_TYPE_DEVMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
+ /* Restrict bpf side of cpumap, open when use-cases appear */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ if (func_id != BPF_FUNC_sk_redirect_map &&
+ func_id != BPF_FUNC_sock_map_update &&
+ func_id != BPF_FUNC_map_delete_elem)
+ goto error;
+ break;
default:
break;
}
@@ -1299,6 +1565,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -1311,13 +1578,26 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sock_map_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
default:
break;
}
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1340,55 +1620,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
+ */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET ||
- regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown_value(regs, i);
+ if (reg_is_pkt_pointer_any(&regs[i]))
+ mark_reg_unknown(env, regs, i);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
- continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type != PTR_TO_PACKET &&
- reg->type != PTR_TO_PACKET_END)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- __mark_reg_unknown_value(state->spilled_regs,
- i / BPF_REG_SIZE);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg_is_pkt_pointer_any(reg))
+ __mark_reg_unknown(reg);
}
}
static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
- struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
+ if (env->ops->get_func_proto)
+ fn = env->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -1402,7 +1682,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1433,26 +1713,33 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
}
+ regs = cur_regs(env);
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(regs, caller_saved[i]);
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
- /* update return register */
+ /* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
- regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1463,12 +1750,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1477,494 +1764,554 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static int check_packet_ptr_add(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static void coerce_reg_to_32(struct bpf_reg_state *reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- struct bpf_reg_state tmp_reg;
- s32 imm;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- /* pkt_ptr += imm */
- imm = insn->imm;
+ /* clear high 32 bits */
+ reg->var_off = tnum_cast(reg->var_off, 4);
+ /* Update bounds */
+ __update_reg_bounds(reg);
+}
-add_imm:
- if (imm < 0) {
- verbose("addition of negative constant to packet pointer is not allowed\n");
- return -EACCES;
- }
- if (imm >= MAX_PACKET_OFF ||
- imm + dst_reg->off >= MAX_PACKET_OFF) {
- verbose("constant %d is too large to add to packet pointer\n",
- imm);
- return -EACCES;
- }
- /* a constant was added to pkt_ptr.
- * Remember it while keeping the same 'id'
- */
- dst_reg->off += imm;
- } else {
- bool had_id;
-
- if (src_reg->type == PTR_TO_PACKET) {
- /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
- tmp_reg = *dst_reg; /* save r7 state */
- *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
- src_reg = &tmp_reg; /* pretend it's src_reg state */
- /* if the checks below reject it, the copy won't matter,
- * since we're rejecting the whole program. If all ok,
- * then imm22 state will be added to r7
- * and r7 will be pkt(id=0,off=22,r=62) while
- * r6 will stay as pkt(id=0,off=0,r=62)
- */
- }
+static bool signed_add_overflows(s64 a, s64 b)
+{
+ /* Do the add in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a + (u64)b);
- if (src_reg->type == CONST_IMM) {
- /* pkt_ptr += reg where reg is known constant */
- imm = src_reg->imm;
- goto add_imm;
- }
- /* disallow pkt_ptr += reg
- * if reg is not uknown_value with guaranteed zero upper bits
- * otherwise pkt_ptr may overflow and addition will become
- * subtraction which is not allowed
- */
- if (src_reg->type != UNKNOWN_VALUE) {
- verbose("cannot add '%s' to ptr_to_packet\n",
- reg_type_str[src_reg->type]);
- return -EACCES;
- }
- if (src_reg->imm < 48) {
- verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
- src_reg->imm);
- return -EACCES;
- }
+ if (b < 0)
+ return res > a;
+ return res < a;
+}
- had_id = (dst_reg->id != 0);
+static bool signed_sub_overflows(s64 a, s64 b)
+{
+ /* Do the sub in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a - (u64)b);
- /* dst_reg stays as pkt_ptr type and since some positive
- * integer value was added to the pointer, increment its 'id'
- */
- dst_reg->id = ++env->id_gen;
-
- /* something was added to pkt_ptr, set range to zero */
- dst_reg->aux_off += dst_reg->off;
- dst_reg->off = 0;
- dst_reg->range = 0;
- if (had_id)
- dst_reg->aux_off_align = min(dst_reg->aux_off_align,
- src_reg->min_align);
- else
- dst_reg->aux_off_align = src_reg->min_align;
- }
- return 0;
+ if (b < 0)
+ return res < a;
+ return res > a;
}
-static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
u8 opcode = BPF_OP(insn->code);
- s64 imm_log2;
+ u32 dst = insn->dst_reg;
- /* for type == UNKNOWN_VALUE:
- * imm > 0 -> number of zero upper bits
- * imm == 0 -> don't track which is the same as all bits can be non-zero
- */
+ dst_reg = &regs[dst];
- if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(dst_reg->imm, src_reg->imm);
- dst_reg->imm--;
- return 0;
- }
- if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where dreg has zero upper bits and sreg is const.
- * Adding them can only result making one more bit
- * non-zero in the larger value.
- */
- imm_log2 = __ilog2_u64((long long)src_reg->imm);
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- return 0;
- }
- /* all other cases non supported yet, just mark dst_reg */
- dst_reg->imm = 0;
- return 0;
+ if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad sbounds\n");
+ return -EINVAL;
+ }
+ if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad ubounds\n");
+ return -EINVAL;
}
- /* sign extend 32-bit imm into 64-bit to make sure that
- * negative values occupy bit 63. Note ilog2() would have
- * been incorrect, since sizeof(insn->imm) == 4
- */
- imm_log2 = __ilog2_u64((long long)insn->imm);
-
- if (dst_reg->imm && opcode == BPF_LSH) {
- /* reg <<= imm
- * if reg was a result of 2 byte load, then its imm == 48
- * which means that upper 48 bits are zero and shifting this reg
- * left by 4 would mean that upper 44 bits are still zero
- */
- dst_reg->imm -= insn->imm;
- } else if (dst_reg->imm && opcode == BPF_MUL) {
- /* reg *= imm
- * if multiplying by 14 subtract 4
- * This is conservative calculation of upper zero bits.
- * It's not trying to special case insn->imm == 1 or 0 cases
- */
- dst_reg->imm -= imm_log2 + 1;
- } else if (opcode == BPF_AND) {
- /* reg &= imm */
- dst_reg->imm = 63 - imm_log2;
- } else if (dst_reg->imm && opcode == BPF_ADD) {
- /* reg += imm */
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- } else if (opcode == BPF_RSH) {
- /* reg >>= imm
- * which means that after right shift, upper bits will be zero
- * note that verifier already checked that
- * 0 <= imm < 64 for shift insn
- */
- dst_reg->imm += insn->imm;
- if (unlikely(dst_reg->imm > 64))
- /* some dumb code did:
- * r2 = *(u32 *)mem;
- * r2 >>= 32;
- * and all bits are zero now */
- dst_reg->imm = 64;
- } else {
- /* all other alu ops, means that we don't know what will
- * happen to the value, mark it with unknown number of zero bits
- */
- dst_reg->imm = 0;
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (!env->allow_ptr_leaks)
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
}
- if (dst_reg->imm < 0) {
- /* all 64 bits of the register can contain non-zero bits
- * and such value cannot be added to ptr_to_packet, since it
- * may overflow, mark it as unknown to avoid further eval
- */
- dst_reg->imm = 0;
+ if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == CONST_PTR_TO_MAP) {
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == PTR_TO_PACKET_END) {
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
+ return -EACCES;
}
- return 0;
-}
-static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
-
- /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
- if (src_reg->imm > 0 && dst_reg->imm) {
- switch (opcode) {
- case BPF_ADD:
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- break;
- case BPF_AND:
- /* dreg &= sreg
- * AND can not extend zero bits only shrink
- * Ex. 0x00..00ffffff
- * & 0x0f..ffffffff
- * ----------------
- * 0x00..00ffffff
- */
- dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
+ */
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
+
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_OR:
- /* dreg |= sreg
- * OR can only extend zero bits
- * Ex. 0x00..00ffffff
- * | 0x0f..ffffffff
- * ----------------
- * 0x0f..00ffffff
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
+ */
+ if (signed_add_overflows(smin_ptr, smin_val) ||
+ signed_add_overflows(smax_ptr, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr + smin_val;
+ dst_reg->smax_value = smax_ptr + smax_val;
+ }
+ if (umin_ptr + umin_val < umin_ptr ||
+ umax_ptr + umax_val < umax_ptr) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value = umin_ptr + umin_val;
+ dst_reg->umax_value = umax_ptr + umax_val;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (reg_is_pkt_pointer(ptr_reg)) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic", JITs might not
+ * be able to deal with it.
+ */
+ if (ptr_reg->type == PTR_TO_STACK) {
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_SUB:
- case BPF_MUL:
- case BPF_RSH:
- case BPF_LSH:
- /* These may be flushed out later */
- default:
- mark_reg_unknown_value(regs, insn->dst_reg);
}
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- }
-
- dst_reg->type = UNKNOWN_VALUE;
- return 0;
-}
-
-static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- u64 dst_imm = dst_reg->imm;
-
- if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
- return evaluate_reg_imm_alu_unknown(env, insn);
-
- /* dst_reg->type == CONST_IMM here. Simulate execution of insns
- * containing ALU ops. Don't care about overflow or negative
- * values, just add/sub/... them; registers are in u64.
- */
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
- dst_imm += insn->imm;
- } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm += src_reg->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
- dst_imm -= insn->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm -= src_reg->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
- dst_imm *= insn->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm *= src_reg->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
- dst_imm |= insn->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm |= src_reg->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
- dst_imm &= insn->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm &= src_reg->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm >>= insn->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm >>= src_reg->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm <<= insn->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm <<= src_reg->imm;
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- goto out;
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
+ */
+ if (signed_sub_overflows(smin_ptr, smax_val) ||
+ signed_sub_overflows(smax_ptr, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr - smax_val;
+ dst_reg->smax_value = smax_ptr - smin_val;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (reg_is_pkt_pointer(ptr_reg)) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ if (smin_val < 0)
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit for now.
+ * (However, in principle we could allow some cases, e.g.
+ * ptr &= ~3 which would reduce min_value by 3.)
+ */
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ if (!env->allow_ptr_leaks)
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
}
- dst_reg->imm = dst_imm;
-out:
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
return 0;
}
-static void check_reg_overflow(struct bpf_reg_state *reg)
-{
- if (reg->max_value > BPF_REGISTER_MAX_RANGE)
- reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
- reg->min_value > BPF_REGISTER_MAX_RANGE)
- reg->min_value = BPF_REGISTER_MIN_RANGE;
-}
-
-static u32 calc_align(u32 imm)
-{
- if (!imm)
- return 1U << 31;
- return imm - ((imm - 1) & imm);
-}
-
-static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- s64 min_val = BPF_REGISTER_MIN_RANGE;
- u64 max_val = BPF_REGISTER_MAX_RANGE;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- u32 dst_align, src_align;
-
- dst_reg = &regs[insn->dst_reg];
- src_align = 0;
- if (BPF_SRC(insn->code) == BPF_X) {
- check_reg_overflow(&regs[insn->src_reg]);
- min_val = regs[insn->src_reg].min_value;
- max_val = regs[insn->src_reg].max_value;
-
- /* If the source register is a random pointer then the
- * min_value/max_value values represent the range of the known
- * accesses into that value, not the actual min/max value of the
- * register itself. In this case we have to reset the reg range
- * values so we know it is not safe to look at.
- */
- if (regs[insn->src_reg].type != CONST_IMM &&
- regs[insn->src_reg].type != UNKNOWN_VALUE) {
- min_val = BPF_REGISTER_MIN_RANGE;
- max_val = BPF_REGISTER_MAX_RANGE;
- src_align = 0;
- } else {
- src_align = regs[insn->src_reg].min_align;
- }
- } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
- (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
- min_val = max_val = insn->imm;
- src_align = calc_align(insn->imm);
- }
-
- dst_align = dst_reg->min_align;
-
- /* We don't know anything about what was done to this register, mark it
- * as unknown. Also, if both derived bounds came from signed/unsigned
- * mixed compares and one side is unbounded, we cannot really do anything
- * with them as boundaries cannot be trusted. Thus, arithmetic of two
- * regs of such kind will get invalidated bounds on the dst side.
- */
- if ((min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (BPF_SRC(insn->code) == BPF_X &&
- ((min_val != BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val != BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
- regs[insn->dst_reg].value_from_signed !=
- regs[insn->src_reg].value_from_signed)) {
- reset_reg_range_values(regs, insn->dst_reg);
- return;
- }
-
- /* If one of our values was at the end of our ranges then we can't just
- * do our normal operations to the register, we need to set the values
- * to the min/max since they are undefined.
- */
- if (opcode != BPF_SUB) {
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ bool src_known, dst_known;
+ s64 smin_val, smax_val;
+ u64 umin_val, umax_val;
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->64 */
+ coerce_reg_to_32(dst_reg);
+ coerce_reg_to_32(&src_reg);
}
+ smin_val = src_reg.smin_value;
+ smax_val = src_reg.smax_value;
+ umin_val = src_reg.umin_value;
+ umax_val = src_reg.umax_value;
+ src_known = tnum_is_const(src_reg.var_off);
+ dst_known = tnum_is_const(dst_reg->var_off);
switch (opcode) {
case BPF_ADD:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value += min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value += max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value += smin_val;
+ dst_reg->smax_value += smax_val;
+ }
+ if (dst_reg->umin_value + umin_val < umin_val ||
+ dst_reg->umax_value + umax_val < umax_val) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value += umin_val;
+ dst_reg->umax_value += umax_val;
+ }
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- /* If one of our values was at the end of our ranges, then the
- * _opposite_ value in the dst_reg goes to the end of our range.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= max_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= min_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value -= smax_val;
+ dst_reg->smax_value -= smin_val;
+ }
+ if (dst_reg->umin_value < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value -= umax_val;
+ dst_reg->umax_value -= umin_val;
+ }
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
case BPF_MUL:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value *= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value *= max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
+ /* Ain't nobody got time to multiply that sign */
+ __mark_reg_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ /* Both values are positive, so we can work with unsigned and
+ * copy the result to signed (unless it exceeds S64_MAX).
+ */
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
+ /* Potential overflow, we know nothing */
+ __mark_reg_unbounded(dst_reg);
+ /* (except what we can learn from the var_off) */
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ dst_reg->umin_value *= umin_val;
+ dst_reg->umax_value *= umax_val;
+ if (dst_reg->umax_value > S64_MAX) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
break;
case BPF_AND:
- /* Disallow AND'ing of negative numbers, ain't nobody got time
- * for that. Otherwise the minimum is 0 and the max is the max
- * value we could AND against.
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value &
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
*/
- if (min_val < 0)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ANDing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ /* ANDing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_LSH:
- /* Gotta have special overflow logic here, if we're shifting
- * more than MAX_RANGE then just assume we have an invalid
- * range.
+ case BPF_OR:
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value |
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- dst_reg->min_align = 1;
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value |
+ dst_reg->var_off.mask;
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ORing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
} else {
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
- dst_reg->min_align <<= min_val;
- }
- if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value <<= max_val;
+ /* ORing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_RSH:
- /* RSH by a negative number is undefined, and the BPF_RSH is an
- * unsigned shift, so make the appropriate casts.
+ case BPF_LSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ break;
+ }
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
*/
- if (min_val < 0 || dst_reg->min_value < 0) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
} else {
- dst_reg->min_value =
- (u64)(dst_reg->min_value) >> min_val;
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
+ }
+ if (src_known)
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ else
+ dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+ break;
+ case BPF_RSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ break;
}
- if (min_val < 0) {
- dst_reg->min_align = 1;
+ /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+ if (dst_reg->smin_value < 0) {
+ if (umin_val) {
+ /* Sign bit will be cleared */
+ dst_reg->smin_value = 0;
+ } else {
+ /* Lost sign bit information */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
} else {
- dst_reg->min_align >>= (u64) min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
+ dst_reg->smin_value =
+ (u64)(dst_reg->smin_value) >> umax_val;
}
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value >>= max_val;
+ if (src_known)
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off,
+ umin_val);
+ else
+ dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
default:
- reset_reg_range_values(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
- check_reg_overflow(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ u8 opcode = BPF_OP(insn->code);
+ int rc;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar.
+ */
+ if (!env->allow_ptr_leaks) {
+ verbose(env, "R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ return 0;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* scalar += unknown scalar */
+ __mark_reg_unknown(&off_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn,
+ dst_reg, off_reg);
+ }
+ return rc;
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += scalar */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, *src_reg);
+ }
+ return rc;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) { /* pointer += K */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += K */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, off_reg);
+ }
+ return rc;
+ }
+ }
+
+ /* Got here implies adding two SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
+ return -EINVAL;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
+ return -EINVAL;
+ }
+ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1973,30 +2320,31 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
- verbose("BPF_END uses reserved fields\n");
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2004,88 +2352,85 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
- /* we are setting our register to something new, we need to
- * reset its range values.
- */
- reset_reg_range_values(regs, insn->dst_reg);
-
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
* copy register state to dest reg
*/
regs[insn->dst_reg] = regs[insn->src_reg];
+ regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
} else {
+ /* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown_value(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ /* high 32 bits are known zero. */
+ regs[insn->dst_reg].var_off = tnum_cast(
+ regs[insn->dst_reg].var_off, 4);
+ __update_reg_bounds(&regs[insn->dst_reg]);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].id = 0;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
- regs[insn->dst_reg].min_align = calc_align(insn->imm);
- regs[insn->dst_reg].value_from_signed = false;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(regs + insn->dst_reg, insn->imm);
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
@@ -2094,306 +2439,330 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
- dst_reg = &regs[insn->dst_reg];
-
- /* first we want to adjust our ranges. */
- adjust_reg_min_max_vals(env, insn);
-
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
- dst_reg->type = PTR_TO_STACK;
- dst_reg->imm = insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_STACK &&
- ((BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == CONST_IMM) ||
- BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
- dst_reg->imm += regs[insn->src_reg].imm;
- else
- dst_reg->imm += insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- (dst_reg->type == PTR_TO_PACKET ||
- (BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == PTR_TO_PACKET))) {
- /* ptr_to_packet += K|X */
- return check_packet_ptr_add(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == UNKNOWN_VALUE &&
- env->allow_ptr_leaks) {
- /* unknown += K|X */
- return evaluate_reg_alu(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == CONST_IMM &&
- env->allow_ptr_leaks) {
- /* reg_imm += K|X */
- return evaluate_reg_imm_alu(env, insn);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->dst_reg);
- return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_X &&
- is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->src_reg);
- return -EACCES;
- }
-
- /* If we did pointer math on a map value then just set it to our
- * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
- * loads to this register appropriately, otherwise just mark the
- * register as unknown.
- */
- if (env->allow_ptr_leaks &&
- BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
- (dst_reg->type == PTR_TO_MAP_VALUE ||
- dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
- dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
- else
- mark_reg_unknown_value(regs, insn->dst_reg);
+ return adjust_reg_min_max_vals(env, insn);
}
return 0;
}
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
- struct bpf_reg_state *dst_reg)
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type,
+ bool range_right_open)
{
struct bpf_reg_state *regs = state->regs, *reg;
+ u16 new_range;
int i;
- /* LLVM can generate two kind of checks:
+ if (dst_reg->off < 0 ||
+ (dst_reg->off == 0 && range_right_open))
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ new_range = dst_reg->off;
+ if (range_right_open)
+ new_range--;
+
+ /* Examples for register markings:
*
- * Type 1:
+ * pkt_data in dst register:
*
* r2 = r3;
* r2 += 8;
* if (r2 > pkt_end) goto <handle exception>
* <access okay>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
* Where:
* r2 == dst_reg, pkt_end == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 2:
+ * pkt_data in src register:
*
* r2 = r3;
* r2 += 8;
* if (pkt_end >= r2) goto <access okay>
* <handle exception>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
* Where:
* pkt_end == dst_reg, r2 == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
* Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
- * so that range of bytes [r3, r3 + 8) is safe to access.
+ * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+ * and [r3, r3 + 8-1) respectively is safe to access depending on
+ * the check.
*/
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ */
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, dst_reg->off);
+ regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max(reg->range, dst_reg->off);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
}
}
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ /* If the dst_reg is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless src_reg were a pointer into
+ * the same object, but we don't bother with that.
+ * Since false_reg and true_reg have the same type by construction, we
+ * only need to check one of them for pointerness.
+ */
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum val is val,
- * otherwise we know the min val is val+1.
- */
- false_reg->max_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val + 1;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ break;
+ case BPF_JLT:
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLT:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum value is val - 1,
- * otherwise we know the mimimum value is val.
- */
- false_reg->max_value = val - 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ break;
+ case BPF_JLE:
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ break;
+ case BPF_JSLE:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
- }
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
}
-/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
- * is the variable reg.
+/* Same as above, but for the case that dst_reg holds a constant and src_reg is
+ * the variable reg.
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /*
- * If this is false, then the val is <= the register, if it is
- * true the register <= to the val.
- */
- false_reg->min_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val - 1;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ break;
+ case BPF_JLT:
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ break;
+ case BPF_JSLT:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /* If this is false then constant < register, if it is true then
- * the register < constant.
- */
- false_reg->min_value = val + 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ break;
+ case BPF_JLE:
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLE:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
+}
+
+/* Regs are known to be equal, so intersect their min/max/var_off */
+static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
+ struct bpf_reg_state *dst_reg)
+{
+ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
+ dst_reg->umin_value);
+ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
+ dst_reg->umax_value);
+ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
+ dst_reg->smin_value);
+ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
+ dst_reg->smax_value);
+ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
+ dst_reg->var_off);
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(src_reg);
+ __reg_deduce_bounds(dst_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(src_reg);
+ __reg_bound_offset(dst_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void reg_combine_min_max(struct bpf_reg_state *true_src,
+ struct bpf_reg_state *true_dst,
+ struct bpf_reg_state *false_src,
+ struct bpf_reg_state *false_dst,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ __reg_combine_min_max(true_src, true_dst);
+ break;
+ case BPF_JNE:
+ __reg_combine_min_max(false_src, false_dst);
+ break;
}
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- if (type == UNKNOWN_VALUE) {
- __mark_reg_unknown_value(regs, regno);
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+ !tnum_equals_const(reg->var_off, 0) ||
+ reg->off)) {
+ __mark_reg_known_zero(reg);
+ reg->off = 0;
+ }
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
- reg->type = type;
+ reg->type = PTR_TO_MAP_VALUE;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
@@ -2407,60 +2776,153 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
* be folded together at some point.
*/
static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, id, type);
+ mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+ }
+}
+
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ struct bpf_verifier_state *this_branch,
+ struct bpf_verifier_state *other_branch)
+{
+ if (BPF_SRC(insn->code) != BPF_X)
+ return false;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_JGT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end < pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JGE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ default:
+ return false;
}
+
+ return true;
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
- if (opcode > BPF_EXIT) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ if (opcode > BPF_JSLE) {
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2469,7 +2931,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
+ dst_reg->type == SCALAR_VALUE &&
+ tnum_equals_const(dst_reg->var_off, insn->imm)) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -2491,17 +2954,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
+ * this is only legit if both are scalars (or pointers to the same
+ * object, I suppose, but we don't support that right now), because
+ * otherwise the different base pointers mean the offsets aren't
+ * comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
- if (regs[insn->src_reg].type == CONST_IMM)
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].imm,
- opcode);
- else if (dst_reg->type == CONST_IMM)
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
- &regs[insn->src_reg], dst_reg->imm,
- opcode);
- } else {
+ if (dst_reg->type == SCALAR_VALUE &&
+ regs[insn->src_reg].type == SCALAR_VALUE) {
+ if (tnum_is_const(regs[insn->src_reg].var_off))
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].var_off.value,
+ opcode);
+ else if (tnum_is_const(dst_reg->var_off))
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg],
+ dst_reg->var_off.value, opcode);
+ else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ /* Comparing for equality, we can combine knowledge */
+ reg_combine_min_max(&other_branch->regs[insn->src_reg],
+ &other_branch->regs[insn->dst_reg],
+ &regs[insn->src_reg],
+ &regs[insn->dst_reg], opcode);
+ }
+ } else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch->regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -2513,24 +2989,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* Mark all identical map registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
- mark_map_regs(this_branch, insn->dst_reg,
- opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
- mark_map_regs(other_branch, insn->dst_reg,
- opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(this_branch, dst_reg);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
+ mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
+ } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+ this_branch, other_branch) &&
+ is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2545,28 +3014,27 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = imm;
- regs[insn->dst_reg].id = 0;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
@@ -2607,47 +3075,89 @@ static bool may_access_skb(enum bpf_prog_type type)
*/
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 mode = BPF_MODE(insn->code);
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_6, SRC_OP);
if (err)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(regs, caller_saved[i]);
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *reg;
+ struct tnum range = tnum_range(0, 1);
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ break;
+ default:
+ return 0;
+ }
+
+ reg = cur_regs(env) + BPF_REG_0;
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(range, reg->var_off)) {
+ verbose(env, "At program exit the register R0 ");
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ verbose(env, " should have been 0 or 1\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -2711,7 +3221,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -2728,13 +3238,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -2828,7 +3338,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -2837,7 +3347,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -2850,60 +3360,199 @@ err_free:
return ret;
}
-/* the following conditions reduce the number of explored insns
- * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+/* check %cur's range satisfies %old's */
+static bool range_within(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value;
+}
+
+/* Maximum number of register states that can exist at once */
+#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+struct idpair {
+ u32 old;
+ u32 cur;
+};
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
- struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
- if (old->id != cur->id)
- return false;
+ unsigned int i;
+
+ for (i = 0; i < ID_MAP_SIZE; i++) {
+ if (!idmap[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ idmap[i].old = old_id;
+ idmap[i].cur = cur_id;
+ return true;
+ }
+ if (idmap[i].old == old_id)
+ return idmap[i].cur == cur_id;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
- /* old ptr_to_packet is more conservative, since it allows smaller
- * range. Ex:
- * old(off=0,r=10) is equal to cur(off=0,r=20), because
- * old(off=0,r=10) means that with range=10 the verifier proceeded
- * further and found no issues with the program. Now we're in the same
- * spot with cur(off=0,r=20), so we're safe too, since anything further
- * will only be looking at most 10 bytes after this pointer.
- */
- if (old->off == cur->off && old->range < cur->range)
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct idpair *idmap)
+{
+ if (!(rold->live & REG_LIVE_READ))
+ /* explored state didn't use this */
return true;
- /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
- * since both cannot be used for packet access and safe(old)
- * pointer has smaller off that could be used for further
- * 'if (ptr > data_end)' check
- * Ex:
- * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
- * that we cannot access the packet.
- * The safe range is:
- * [ptr, ptr + range - off)
- * so whenever off >=range, it means no safe bytes from this pointer.
- * When comparing old->off <= cur->off, it means that older code
- * went with smaller offset and that offset was later
- * used to figure out the safe range after 'if (ptr > data_end)' check
- * Say, 'old' state was explored like:
- * ... R3(off=0, r=0)
- * R4 = R3 + 20
- * ... now R4(off=20,r=0) <-- here
- * if (R4 > data_end)
- * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
- * ... the code further went all the way to bpf_exit.
- * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
- * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
- * goes further, such cur_R4 will give larger safe packet range after
- * 'if (R4 > data_end)' and all further insn were already good with r=20,
- * so they will be good with r=30 and we can prune the search.
- */
- if (!env->strict_alignment && old->off <= cur->off &&
- old->off >= old->range && cur->off >= cur->range)
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
return true;
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ if (rcur->type == NOT_INIT)
+ return false;
+ switch (rold->type) {
+ case SCALAR_VALUE:
+ if (rcur->type == SCALAR_VALUE) {
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* if we knew anything about the old value, we're not
+ * equal, because we can't know anything about the
+ * scalar value of the pointer in the new value.
+ */
+ return rold->umin_value == 0 &&
+ rold->umax_value == U64_MAX &&
+ rold->smin_value == S64_MIN &&
+ rold->smax_value == S64_MAX &&
+ tnum_is_unknown(rold->var_off);
+ }
+ case PTR_TO_MAP_VALUE:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ * We don't care about the 'id' value, because nothing
+ * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ if (rcur->type != rold->type)
+ return false;
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_CTX:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_END:
+ /* Only valid matches are exact, which memcmp() above
+ * would have accepted
+ */
+ default:
+ /* Don't know what's going on, just say it's not safe */
+ return false;
+ }
+
+ /* Shouldn't get here; if we do, say it's not safe */
+ WARN_ON_ONCE(1);
return false;
}
+static bool stacksafe(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ struct idpair *idmap)
+{
+ int i, spi;
+
+ /* if explored stack has more populated slots than current stack
+ * such stacks are not equivalent
+ */
+ if (old->allocated_stack > cur->allocated_stack)
+ return false;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ spi = i / BPF_REG_SIZE;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr,
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ }
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -2934,84 +3583,101 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
- bool varlen_map_access = env->varlen_map_value_access;
- struct bpf_reg_state *rold, *rcur;
+ struct idpair *idmap;
+ bool ret = false;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- rold = &old->regs[i];
- rcur = &cur->regs[i];
+ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+ /* If we failed to allocate the idmap, just say it's not safe */
+ if (!idmap)
+ return false;
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
- continue;
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
+ goto out_free;
+ }
- /* If the ranges were not the same, but everything else was and
- * we didn't do a variable access into a map then we are a-ok.
- */
- if (!varlen_map_access &&
- memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
- continue;
+ if (!stacksafe(old, cur, idmap))
+ goto out_free;
+ ret = true;
+out_free:
+ kfree(idmap);
+ return ret;
+}
- /* If we didn't map access then again we don't care about the
- * mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
- */
- if (rold->type == NOT_INIT ||
- (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
- continue;
+/* A write screens off any subsequent reads; but write marks come from the
+ * straight-line code between a state and its parent. When we arrive at a
+ * jump target (in the first iteration of the propagate_liveness() loop),
+ * we didn't arrive by the straight-line code, so read marks in state must
+ * propagate to parent regardless of state's write marks.
+ */
+static bool do_propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ bool writes = parent == state->parent; /* Observe write marks */
+ bool touched = false; /* any changes made? */
+ int i;
- /* Don't care about the reg->id in this case. */
- if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rold->map_ptr == rcur->map_ptr)
+ if (!parent)
+ return touched;
+ /* Propagate read liveness of registers... */
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+ /* We don't need to worry about FP liveness because it's read-only */
+ for (i = 0; i < BPF_REG_FP; i++) {
+ if (parent->regs[i].live & REG_LIVE_READ)
continue;
-
- if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(env, rold, rcur))
+ if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
continue;
-
- return false;
+ if (state->regs[i].live & REG_LIVE_READ) {
+ parent->regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
}
-
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
+ /* ... and stack slots */
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- return false;
- if (i % BPF_REG_SIZE)
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (old->stack_slot_type[i] != STACK_SPILL)
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- return false;
- else
+ if (writes &&
+ (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
continue;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+ parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ return touched;
+}
+
+/* "parent" is "a state from which we reach the current state", but initially
+ * it is not the state->parent (i.e. "the state whose straight-line code leads
+ * to the current state"), instead it is the state that happened to arrive at
+ * a (prunable) equivalent of the current state. See comment above
+ * do_propagate_liveness() for consequences of this.
+ * This function is just a more efficient way of calling mark_reg_read() or
+ * mark_stack_slot_read() on each reg in "parent" that is read in "state",
+ * though it requires that parent != state->parent in the call arguments.
+ */
+static void propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ while (do_propagate_liveness(state, parent)) {
+ /* Something changed, so we need to feed those changes onward */
+ state = parent;
+ parent = state->parent;
}
- return true;
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int i, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3021,11 +3687,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
- * prune the search
+ * prune the search.
+ * Registers read by the continuation are read by us.
+ * If we have any write marks in env->cur_state, they
+ * will prevent corresponding reads in the continuation
+ * from reaching our parent (an explored_state). Our
+ * own state will get the read marks recorded, but
+ * they'll be immediately forgotten as we're pruning
+ * this state and will pop a new one.
*/
+ propagate_liveness(&sl->state, cur);
return 1;
+ }
sl = sl->next;
}
@@ -3035,46 +3710,68 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ err = copy_verifier_state(&new_sl->state, cur);
+ if (err) {
+ free_verifier_state(&new_sl->state, false);
+ kfree(new_sl);
+ return err;
+ }
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
+ /* connect new state to parentage chain */
+ cur->parent = &new_sl->state;
+ /* clear write marks in current state: the writes we did are not writes
+ * our child did, so they don't screen off its reads from us.
+ * (There are no read marks in current state, because reads always mark
+ * their parent and current state never has children yet. Only
+ * explored_states can get read marks.)
+ */
+ for (i = 0; i < BPF_REG_FP; i++)
+ cur->regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+ if (cur->stack[i].slot_type[0] == STACK_SPILL)
+ cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
return 0;
}
static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
- return 0;
+ if (env->dev_ops && env->dev_ops->insn_hook)
+ return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
- return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ return 0;
}
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ env->cur_state = state;
+ init_reg_state(env, state->regs);
+ state->parent = NULL;
insn_idx = 0;
- env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3083,7 +3780,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3093,12 +3791,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3106,25 +3804,27 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(env, insn);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
+ print_bpf_insn(verbose, env, insn,
+ env->allow_ptr_leaks);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
if (err)
return err;
+ regs = cur_regs(env);
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3136,11 +3836,11 @@ static int do_check(struct bpf_verifier_env *env)
/* check for reserved fields is already done */
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -3174,7 +3874,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3190,11 +3890,11 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3214,18 +3914,18 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3244,7 +3944,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3257,7 +3957,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3269,7 +3969,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3279,18 +3979,23 @@ static int do_check(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
+ err = check_return_code(env);
+ if (err)
+ return err;
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
+ err = pop_stack(env, &prev_insn_idx, &insn_idx);
+ if (err < 0) {
+ if (err != -ENOENT)
+ return err;
break;
} else {
do_print_state = true;
@@ -3316,20 +4021,19 @@ process_bpf_exit:
insn_idx++;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
- reset_reg_range_values(regs, insn->dst_reg);
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3341,7 +4045,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3352,12 +4057,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3380,14 +4085,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -3398,7 +4103,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -3407,19 +4112,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -3528,7 +4234,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const struct bpf_verifier_ops *ops = env->ops;
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
@@ -3541,7 +4247,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -3589,7 +4295,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose("bpf verifier narrow ctx access misconfigured\n");
+ verbose(env, "bpf verifier narrow ctx access misconfigured\n");
return -EINVAL;
}
@@ -3608,7 +4314,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
(ctx_field_size && !target_size)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -3678,7 +4384,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
- if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * handlers are currently limited to 64 bit only.
+ */
+ if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
@@ -3686,7 +4396,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -3703,13 +4413,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_redirect_map) {
+ /* Note, we cannot use prog directly as imm as subsequent
+ * rewrites would still change the prog pointer. The only
+ * stable address we can use is aux, which also works with
+ * prog clones during blinding.
+ */
+ u64 addr = (unsigned long)prog->aux;
+ struct bpf_insn r4_ld[] = {
+ BPF_LD_IMM64(BPF_REG_4, addr),
+ *insn,
+ };
+ cnt = ARRAY_SIZE(r4_ld);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
patch_call_imm:
- fn = prog->aux->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -3733,6 +4465,7 @@ static void free_states(struct bpf_verifier_env *env)
if (sl)
while (sl != STATE_LIST_MARK) {
sln = sl->next;
+ free_verifier_state(&sl->state, false);
kfree(sl);
sl = sln;
}
@@ -3743,16 +4476,21 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
+ /* no program is valid */
+ if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+ return -EINVAL;
+
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -3760,6 +4498,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->insn_aux_data)
goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -3768,29 +4507,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
- goto err_unlock;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (env->prog->aux->offload) {
+ ret = bpf_prog_offload_verifier_prep(env);
+ if (ret)
+ goto err_unlock;
+ }
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -3809,9 +4546,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
if (ret == 0)
@@ -3821,17 +4562,11 @@ skip_full_check:
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
+ if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
- /* fall through to return what was recorded */
- }
-
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && !log->ubuf) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
if (ret == 0 && env->used_map_cnt) {
@@ -3842,7 +4577,7 @@ skip_full_check:
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -3855,9 +4590,7 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
+err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -3871,58 +4604,3 @@ err_free_env:
kfree(env);
return ret;
}
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
- void *priv)
-{
- struct bpf_verifier_env *env;
- int ret;
-
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
- if (!env)
- return -ENOMEM;
-
- env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
- prog->len);
- ret = -ENOMEM;
- if (!env->insn_aux_data)
- goto err_free_env;
- env->prog = prog;
- env->analyzer_ops = ops;
- env->analyzer_priv = priv;
-
- /* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
-
- log_level = 0;
-
- env->strict_alignment = false;
- if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- env->strict_alignment = true;
-
- env->explored_states = kcalloc(env->prog->len,
- sizeof(struct bpf_verifier_state_list *),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!env->explored_states)
- goto skip_full_check;
-
- ret = check_cfg(env);
- if (ret < 0)
- goto skip_full_check;
-
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
- ret = do_check(env);
-
-skip_full_check:
- while (pop_stack(env, NULL) >= 0);
- free_states(env);
-
- mutex_unlock(&bpf_verifier_lock);
- vfree(env->insn_aux_data);
-err_free_env:
- kfree(env);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..1e1c0236f55b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/capability.c
*
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..ae448f7632cc 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := cgroup.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509..bf54ade001be 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H
@@ -156,6 +157,8 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
+bool cgroup_is_thread_root(struct cgroup *cgrp);
+bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -173,7 +176,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx);
@@ -183,10 +186,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup);
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b1533f34..024085daab1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (cgroup_on_dfl(to))
return -EINVAL;
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
mutex_lock(&cgroup_mutex);
@@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, &it);
+ css_task_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
@@ -373,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -510,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return 0;
}
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ cred = current_cred();
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -592,7 +641,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
+ .write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
@@ -611,7 +660,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
+ .write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
@@ -701,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
rcu_read_unlock();
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -846,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -900,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..00f5b358aeac 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
@@ -436,22 +525,12 @@ out_unlock:
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -1737,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -1753,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -2168,21 +2314,52 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
@@ -2387,8 +2564,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2415,96 +2593,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- struct cgroup *src_cgrp, *com_cgrp;
- struct inode *inode;
- int ret;
-
- if (!cgroup_on_dfl(dst_cgrp)) {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->euid, tcred->suid))
- ret = 0;
- else
- ret = -EACCES;
-
- put_cred(tcred);
- return ret;
- }
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* and the common ancestor */
- com_cgrp = src_cgrp;
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- com_cgrp = cgroup_parent(com_cgrp);
-
- /* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- if (ret)
- return ret;
-
- /*
- * If namespaces are delegation boundaries, %current must be able
- * to see both source and destination cgroups from its namespace.
- */
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- return -ENOENT;
-
- return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
@@ -2520,35 +2625,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2891,6 +2994,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2966,33 +3109,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
@@ -3011,6 +3130,172 @@ out_unlock:
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
@@ -3018,6 +3303,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -3234,7 +3531,6 @@ restart:
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
@@ -3659,6 +3955,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
@@ -3667,32 +4015,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
@@ -3732,6 +4067,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
@@ -3746,11 +4082,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
@@ -3758,7 +4101,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
@@ -3769,6 +4112,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3826,6 +4170,9 @@ void css_task_iter_end(struct css_task_iter *it)
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
@@ -3842,16 +4189,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3869,24 +4212,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
+ return 0;
+}
+
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
return 0;
}
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
@@ -3897,6 +4385,14 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
@@ -3912,6 +4408,20 @@ static struct cftype cgroup_base_files[] = {
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
{ } /* terminate */
};
@@ -4011,9 +4521,15 @@ static void css_release_work_fn(struct work_struct *work)
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4100,9 +4616,6 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
@@ -4211,10 +4724,17 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,13 +4761,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -4255,6 +4774,29 @@ out_free_cgrp:
return ERR_PTR(ret);
}
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level > cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
@@ -4269,6 +4811,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -4420,6 +4967,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4464,7 +5012,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4659,11 +5215,17 @@ int __init cgroup_init(void)
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
@@ -4708,6 +5270,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5175,14 +5749,33 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 87a1213dd326..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,7 +56,8 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
-
+#include <linux/oom.h>
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@@ -300,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -577,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/*
* generate_sched_domains()
*
@@ -639,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -649,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -666,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -690,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -772,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -785,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -862,7 +874,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -896,8 +908,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -914,7 +925,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1092,7 +1103,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1150,8 +1161,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1168,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1285,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1460,7 +1470,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1979,7 +1989,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2056,7 +2066,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2250,7 +2260,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2260,6 +2270,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2281,7 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2334,8 +2351,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2344,16 +2363,15 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2499,12 +2517,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2527,7 +2545,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2545,7 +2563,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af22782..5f780d8f6a9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Debug controller
*
@@ -114,27 +115,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- int dead_cnt = 0, extra_refs = 0;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
spin_lock_irq(&css_set_lock);
+
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
int refcnt = refcount_read(&cset->refcount);
- seq_printf(seq, " %d", refcnt);
- if (refcnt - cset->nr_tasks > 0) {
- int extra = refcnt - cset->nr_tasks;
-
- seq_printf(seq, " +%d", extra);
- /*
- * Take out the one additional reference in
- * init_css_set.
- */
- if (cset == &init_css_set)
- extra--;
- extra_refs += extra;
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
}
seq_puts(seq, "\n");
@@ -163,10 +186,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
}
spin_unlock_irq(&css_set_lock);
- if (!dead_cnt && !extra_refs)
+ if (!dead_cnt && !extra_refs && !threaded_csets)
return 0;
seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
if (extra_refs)
seq_printf(seq, "extra references = %d\n", extra_refs);
if (dead_cnt)
@@ -352,6 +377,7 @@ static int __init enable_cgroup_debug(char *str)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..08236798d173 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 66129eb4371d..b05f1dd58a62 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "cgroup-internal.h"
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
.free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .threaded = true,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 6f0a0e723a06..772e038d04d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
-int compat_convert_timespec(struct timespec __user **kts,
- const void __user *cts)
-{
- struct timespec ts;
- struct timespec __user *uts;
-
- if (!cts || COMPAT_USE_64BIT_TIME) {
- *kts = (struct timespec __user *)cts;
- return 0;
- }
-
- uts = compat_alloc_user_space(sizeof(ts));
- if (!uts)
- return -EFAULT;
- if (compat_get_timespec(&ts, cts))
- return -EFAULT;
- if (copy_to_user(uts, &ts, sizeof(ts)))
- return -EFAULT;
-
- *kts = uts;
- return 0;
-}
-
int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
{
struct compat_itimerval v32;
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index d70829033bb7..d3fd428f4b92 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -10,6 +10,7 @@
# CONFIG_USELIB is not set
CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index eee033134262..04892a82f6ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
@@ -46,11 +47,13 @@
* @bringup: Single callback bringup or teardown selector
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
+ enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
bool single;
bool bringup;
struct hlist_node *node;
+ struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
- struct completion done;
+ struct completion done_up;
+ struct completion done_down;
#endif
};
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
- STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
#endif
/**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
- bool bringup, struct hlist_node *node)
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+
+ if (!(bringup ? step->startup.single : step->teardown.single))
+ return 0;
+
+ return -EAGAIN;
+ }
+
if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* Single invocation for instance add/remove */
if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
- if (ret)
- goto err;
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
cnt++;
}
+ if (lastp)
+ *lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
- cbm(cpu, node);
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = st->state < target;
+
+ return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->target = prev_state;
+ st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
+ int ret;
+
+ prev_state = cpuhp_set_state(st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
+}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE) {
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- }
- return st->result;
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
- }
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
-{
- enum cpuhp_state prev_state = st->state;
- int ret = 0;
-
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
- if (ret) {
- st->target = prev_state;
- undo_cpu_down(cpu, st);
- break;
- }
- }
- return ret;
-}
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
}
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
while (st->state < target) {
st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- init_completion(&st->done);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
return st->should_run;
}
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
- return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- int ret = 0;
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
/*
- * Paired with the mb() in cpuhp_kick_ap_work and
- * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (!st->should_run)
+
+ if (WARN_ON_ONCE(!st->should_run))
return;
- st->should_run = false;
+ cpuhp_lock_acquire(bringup);
- lock_map_acquire(&cpuhp_state_lock_map);
- /* Single callback invocation for [un]install ? */
if (st->single) {
- if (st->cb_state < CPUHP_AP_ONLINE) {
- local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
- local_irq_enable();
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ if (bringup) {
+ st->state++;
+ state = st->state;
+ st->should_run = (st->state < st->target);
+ WARN_ON_ONCE(st->state > st->target);
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
+ state = st->state;
+ st->state--;
+ st->should_run = (st->state > st->target);
+ WARN_ON_ONCE(st->state < st->target);
}
- } else if (st->rollback) {
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
+
+ if (st->rollback) {
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ if (step->skip_onerr)
+ goto next;
+ }
- undo_cpu_down(cpu, st);
- st->rollback = false;
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
+
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
} else {
- /* Cannot happen .... */
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
- /* Regular hotplug work */
- if (st->state < st->target)
- ret = cpuhp_ap_online(cpu, st);
- else if (st->state > st->target)
- ret = cpuhp_ap_offline(cpu, st);
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We loose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
}
- lock_map_release(&cpuhp_state_lock_map);
- st->result = ret;
- complete(&st->done);
+
+next:
+ cpuhp_lock_release(bringup);
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
@@ -460,62 +594,69 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
if (!cpu_online(cpu))
return 0;
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, bringup, node);
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+ st->rollback = false;
+ st->last = NULL;
+
+ st->node = node;
+ st->bringup = bringup;
st->cb_state = state;
st->single = true;
- st->bringup = bringup;
- st->node = node;
+
+ __cpuhp_kick_ap(st);
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * If we failed and did a partial, do a rollback.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
- wait_for_completion(&st->done);
- return st->result;
-}
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
- st->result = 0;
- st->single = false;
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * Clean up the leftovers so the next hotplug operation wont use stale
+ * data.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
+ st->node = st->last = NULL;
+ return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- enum cpuhp_state state = st->state;
+ enum cpuhp_state prev_state = st->state;
+ int ret;
- trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- trace_cpuhp_exit(cpu, st->state, state, st->result);
- return st->result;
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+ return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +722,7 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
+ int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -594,8 +736,13 @@ static int take_cpu_down(void *_param)
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -639,7 +786,7 @@ static int takedown_cpu(unsigned int cpu)
*
* Wait for the stop thread to go away.
*/
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -650,6 +797,7 @@ static int takedown_cpu(unsigned int cpu)
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
+ rcutree_migrate_callbacks(cpu);
return 0;
}
@@ -657,7 +805,7 @@ static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
- complete(&st->done);
+ complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
@@ -675,11 +823,32 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
-#ifdef CONFIG_HOTPLUG_CPU
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st);
+ break;
+ }
+ }
+ return ret;
+}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -698,13 +867,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = st->state;
- st->target = target;
+ prev_state = cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
@@ -719,6 +888,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
+
+ st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -726,13 +897,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- st->target = prev_state;
- st->rollback = true;
- cpuhp_kick_ap_work(cpu);
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
}
out:
cpus_write_unlock();
+ /*
+ * Do post unplug cleanup. This is still protected against
+ * concurrent CPU hotplug via cpu_add_remove_lock.
+ */
+ lockup_detector_cleanup();
return ret;
}
@@ -753,11 +928,15 @@ out:
cpu_maps_update_done();
return err;
}
+
int cpu_down(unsigned int cpu)
{
return do_cpu_down(cpu, CPUHP_OFFLINE);
}
EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
@@ -771,11 +950,16 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
st->state++;
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
}
@@ -793,7 +977,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
- complete(&st->done);
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
@@ -828,7 +1012,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- st->target = target;
+ cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1252,7 +1436,17 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
struct cpuhp_step *sp;
int ret = 0;
- if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
+ /*
+ * If name is NULL, then the state gets removed.
+ *
+ * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
+ * the first allocation from these dynamic ranges, so the removal
+ * would trigger a new allocation and clear the wrong (already
+ * empty) state, leaving the callbacks of the to be cleared state
+ * dangling, which causes wreckage on the next hotplug operation.
+ */
+ if (name && (state == CPUHP_AP_ONLINE_DYN ||
+ state == CPUHP_BP_PREPARE_DYN)) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
return ret;
@@ -1285,6 +1479,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
+ /*
+ * If there's nothing to do, we done.
+ * Relies on the union for multi_instance.
+ */
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
return 0;
@@ -1296,9 +1494,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1630,9 +1828,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
+ &dev_attr_fail.attr,
NULL
};
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..67b02e138a47 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,15 +22,21 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ /*
+ * __atomic_notifier_call_chain has a RCU read critical section, which
+ * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
+ * RCU know this.
+ */
+ rcu_irq_enter_irqson();
+ ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
+ rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -100,7 +92,6 @@ int cpu_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -108,7 +99,6 @@ int cpu_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/dma.c b/kernel/dma.c
index 6c6262f86c17..3506fc34a712 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index e556751d15d9..fc482c8e0bd8 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 2925188f50ea..3c022e33c109 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ac9c8b02fb..3939a4674e0a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
perf_ctx_lock(cpuctx, task_ctx);
/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (task) {
if (task == TASK_TOMBSTONE)
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+
+ if (leader->state <= PERF_EVENT_STATE_OFF)
+ return leader->state;
+
+ return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+ enum perf_event_state state = __perf_effective_state(event);
+ u64 delta = now - event->tstamp;
+
+ *enabled = event->total_time_enabled;
+ if (state >= PERF_EVENT_STATE_INACTIVE)
+ *enabled += delta;
+
+ *running = event->total_time_running;
+ if (state >= PERF_EVENT_STATE_ACTIVE)
+ *running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+ u64 now = perf_event_time(event);
+
+ __perf_update_times(event, now, &event->total_time_enabled,
+ &event->total_time_running);
+ event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+ struct perf_event *sibling;
+
+ list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+ if (event->state == state)
+ return;
+
+ perf_event_update_time(event);
+ /*
+ * If a group leader gets enabled/disabled all its siblings
+ * are affected too.
+ */
+ if ((event->state < 0) ^ (state < 0))
+ perf_event_update_sibling_time(event);
+
+ WRITE_ONCE(event->state, state);
+}
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -662,7 +744,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgrp == event->cgrp)
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
- /*
- * when the current task's perf cgroup does not match
- * the event's, we need to remember to call the
- * perf_mark_enable() function the first time a task with
- * a matching perf cgroup is scheduled in.
- */
- if (is_cgroup_event(event) && !perf_cgroup_match(event))
- event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- if (!event->cgrp_defer_enabled)
- return;
-
- event->cgrp_defer_enabled = 0;
-
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- sub->cgrp_defer_enabled = 0;
- }
- }
-}
-
/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
@@ -901,9 +949,11 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
if (add) {
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+
list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
- if (perf_cgroup_from_task(current, ctx) == event->cgrp)
- cpuctx->cgrp = event->cgrp;
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ cpuctx->cgrp = cgrp;
} else {
list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
@@ -973,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
@@ -1004,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
struct perf_cpu_context *cpuctx;
int rotations = 0;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
rotations = perf_rotate_context(cpuctx);
@@ -1091,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(!list_empty(&ctx->active_ctx_list));
@@ -1100,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(list_empty(&ctx->active_ctx_list));
@@ -1200,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
- ctx = ACCESS_ONCE(event->ctx);
+ ctx = READ_ONCE(event->ctx);
if (!atomic_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
@@ -1249,26 +1288,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1391,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
-
- lockdep_assert_held(&ctx->lock);
-
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
-
- /*
- * in cgroup mode, time_enabled represents
- * the time the event was enabled AND active
- * tasks were in the monitored cgroup. This is
- * independent of the activity of the context as
- * there may be a mix of cgroup and non-cgroup events.
- *
- * That is why we treat cgroup events differently
- * here.
- */
- if (is_cgroup_event(event))
- run_end = perf_cgroup_event_time(event);
- else if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
-
- event->total_time_enabled = run_end - event->tstamp_enabled;
-
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = perf_event_time(event);
-
- event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
-}
-
static enum event_type_t get_event_type(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
@@ -1487,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
+ event->tstamp = perf_event_time(event);
+
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
@@ -1570,6 +1562,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -1691,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
list_del_init(&event->group_entry);
- update_group_times(event);
-
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
@@ -1701,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
ctx->generation++;
}
@@ -1800,38 +1793,24 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
+ enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- /*
- * An event which could not be activated because of
- * filter mismatch still needs to have its timings
- * maintained, otherwise bogus information is return
- * via read() for time_enabled, time_running:
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
-
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
perf_pmu_disable(event->pmu);
- event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
- event->state = PERF_EVENT_STATE_INACTIVE;
+
if (event->pending_disable) {
event->pending_disable = 0;
- event->state = PERF_EVENT_STATE_OFF;
+ state = PERF_EVENT_STATE_OFF;
}
+ perf_event_set_state(event, state);
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1851,7 +1830,9 @@ group_sched_out(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event;
- int state = group_event->state;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
perf_pmu_disable(ctx->pmu);
@@ -1865,7 +1846,7 @@ group_sched_out(struct perf_event *group_event,
perf_pmu_enable(ctx->pmu);
- if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ if (group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -1885,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event,
{
unsigned long flags = (unsigned long)info;
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
@@ -1947,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
+
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
/*
@@ -2011,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
}
static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx,
- u64 tstamp)
+ struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
@@ -2040,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event,
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, tstamp);
+ perf_cgroup_set_shadow_time(event, event->tstamp);
else
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2055,7 +2043,6 @@ event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
int ret = 0;
lockdep_assert_held(&ctx->lock);
@@ -2065,11 +2052,12 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id());
/*
- * Order event::oncpu write to happen before the ACTIVE state
- * is visible.
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible. This allows perf_event_{stop,read}() to observe the correct
+ * ->oncpu if it sees ACTIVE.
*/
smp_wmb();
- WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2081,26 +2069,19 @@ event_sched_in(struct perf_event *event,
event->hw.interrupts = 0;
}
- /*
- * The new state must be visible before we turn it on in the hardware:
- */
- smp_wmb();
-
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx, tstamp);
+ perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
- event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -2124,8 +2105,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
- bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
@@ -2155,27 +2134,13 @@ group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
- * The events up to the failed event are scheduled out normally,
- * tstamp_stopped will be updated.
- *
- * The failed events and the remaining siblings need to have
- * their timings updated as if they had gone thru event_sched_in()
- * and event_sched_out(). This is required to get consistent timings
- * across the group. This also takes care of the case where the group
- * could never be scheduled by ensuring tstamp_stopped is set to mark
- * the time the event was actually stopped, such that time delta
- * calculation in update_event_times() is correct.
+ * The events up to the failed event are scheduled out normally.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- simulate = true;
+ break;
- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
- event_sched_out(event, cpuctx, ctx);
- }
+ event_sched_out(event, cpuctx, ctx);
}
event_sched_out(group_event, cpuctx, ctx);
@@ -2217,46 +2182,11 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- * total_time_enabled = tstamp_stopped - tstamp_enabled
- * total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
- event->tstamp_stopped = now;
- event->tstamp_enabled = now - event->total_time_enabled;
- event->tstamp_running = now - event->total_time_running;
-}
-
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- /*
- * We can be called with event->state == STATE_OFF when we create with
- * .disabled = 1. In that case the IOC_ENABLE will call this function.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2488,28 +2418,6 @@ again:
}
/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- event->state = PERF_EVENT_STATE_INACTIVE;
- __perf_event_enable_time(event, tstamp);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- /* XXX should not be > INACTIVE if event isn't */
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(sub, tstamp);
- }
-}
-
-/*
* Cross CPU call to enable a performance event
*/
static void __perf_event_enable(struct perf_event *event,
@@ -2527,14 +2435,12 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->is_active)
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
- if (is_cgroup_event(event))
- perf_cgroup_defer_enabled(event);
ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
@@ -2854,18 +2760,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- switch (event->state) {
- case PERF_EVENT_STATE_ACTIVE:
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
- /* fall-through */
- case PERF_EVENT_STATE_INACTIVE:
- update_event_times(event);
- break;
-
- default:
- break;
- }
+ perf_event_update_time(event);
/*
* In order to keep per-task stats reliable we need to flip the event
@@ -3102,10 +3000,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -3113,10 +3007,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -3138,10 +3030,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -3211,6 +3099,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3224,6 +3119,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3504,7 +3401,7 @@ void perf_event_task_tick(void)
struct perf_event_context *ctx, *tmp;
int throttled;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@ -3524,7 +3421,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
return 1;
}
@@ -3618,12 +3515,15 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ perf_event_update_time(event);
+ if (data->group)
+ perf_event_update_sibling_time(event);
+
if (event->state != PERF_EVENT_STATE_ACTIVE)
goto unlock;
@@ -3638,7 +3538,6 @@ static void __perf_event_read(void *info)
pmu->read(event);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- update_event_times(sub);
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3656,10 +3555,7 @@ unlock:
static inline u64 perf_event_count(struct perf_event *event)
{
- if (event->pmu->count)
- return event->pmu->count(event);
-
- return __perf_event_count(event);
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
}
/*
@@ -3670,7 +3566,8 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
@@ -3690,15 +3587,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
- /*
- * It must not have a pmu::count method, those are not
- * NMI safe.
- */
- if (event->pmu->count) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
/* If this is a per-task event, it must be for current */
if ((event->attach_state & PERF_ATTACH_TASK) &&
event->hw.target != current) {
@@ -3722,6 +3610,16 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
event->pmu->read(event);
*value = local64_read(&event->count);
+ if (enabled || running) {
+ u64 now = event->shadow_ctx_time + perf_clock();
+ u64 __enabled, __running;
+
+ __perf_update_times(event, now, &__enabled, &__running);
+ if (enabled)
+ *enabled = __enabled;
+ if (running)
+ *running = __running;
+ }
out:
local_irq_restore(flags);
@@ -3730,23 +3628,35 @@ out:
static int perf_event_read(struct perf_event *event, bool group)
{
+ enum perf_event_state state = READ_ONCE(event->state);
int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
- struct perf_read_data data = {
- .event = event,
- .group = group,
- .ret = 0,
- };
+again:
+ if (state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data;
+
+ /*
+ * Orders the ->state and ->oncpu loads such that if we see
+ * ACTIVE we must also see the right ->oncpu.
+ *
+ * Matches the smp_wmb() from event_sched_in().
+ */
+ smp_rmb();
event_cpu = READ_ONCE(event->oncpu);
if ((unsigned)event_cpu >= nr_cpu_ids)
return 0;
+ data = (struct perf_read_data){
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
+
preempt_disable();
event_cpu = __perf_event_read_cpu(event, event_cpu);
@@ -3763,24 +3673,30 @@ static int perf_event_read(struct perf_event *event, bool group)
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
preempt_enable();
ret = data.ret;
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ state = event->state;
+ if (state != PERF_EVENT_STATE_INACTIVE) {
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ goto again;
+ }
+
/*
- * may read while context is not active
- * (e.g., thread is blocked), in that case
- * we cannot update context time
+ * May read while context is not active (e.g., thread is
+ * blocked), in that case we cannot update context time
*/
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
+ perf_event_update_time(event);
if (group)
- update_group_times(event);
- else
- update_event_times(event);
+ perf_event_update_sibling_time(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -4226,7 +4142,7 @@ static void perf_remove_from_owner(struct perf_event *event)
* indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- owner = lockless_dereference(event->owner);
+ owner = READ_ONCE(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -4323,7 +4239,7 @@ again:
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
*/
- ctx = lockless_dereference(child->ctx);
+ ctx = READ_ONCE(child->ctx);
/*
* Since child_mutex nests inside ctx::mutex, we must jump
* through hoops. We start by grabbing a reference on the ctx.
@@ -4383,7 +4299,7 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
@@ -4411,6 +4327,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
return total;
}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ count = __perf_event_read_value(event, enabled, running);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
@@ -4426,6 +4354,8 @@ static int __perf_read_group_add(struct perf_event *leader,
if (ret)
return ret;
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
/*
* Since we co-schedule groups, {enabled,running} times of siblings
* will be identical to those of the leader, so we only publish one
@@ -4448,8 +4378,6 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- raw_spin_lock_irqsave(&ctx->lock, flags);
-
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -4513,7 +4441,7 @@ static int perf_read_one(struct perf_event *event,
u64 values[4];
int n = 0;
- values[n++] = perf_event_read_value(event, &enabled, &running);
+ values[n++] = __perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@ -4892,8 +4820,7 @@ static void calc_timer_values(struct perf_event *event,
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
+ __perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
@@ -5297,8 +5224,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
- aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
- aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
@@ -6003,6 +5930,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6018,6 +5948,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6136,6 +6098,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
static void __always_inline
@@ -7287,6 +7252,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7302,7 +7272,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7896,11 +7866,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
@@ -8024,6 +7992,7 @@ static void bpf_overflow_handler(struct perf_event *event,
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
+ .event = event,
};
int ret = 0;
@@ -8086,18 +8055,17 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return perf_event_set_bpf_handler(event, prog_fd);
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8106,13 +8074,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
@@ -8120,25 +8089,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
- perf_event_free_bpf_handler(event);
-
- if (!event->tp_event)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ perf_event_free_bpf_handler(event);
return;
-
- prog = event->tp_event->prog;
- if (prog) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
}
+ perf_event_detach_bpf_prog(event);
}
#else
@@ -8904,6 +8868,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
static void free_pmu_context(struct pmu *pmu)
{
+ /*
+ * Static contexts such as perf_sw_context have a global lifetime
+ * and may be shared between different PMUs. Avoid freeing them
+ * when a single PMU is going away.
+ */
+ if (pmu->task_ctx_nr > perf_invalid_context)
+ return;
+
mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
mutex_unlock(&pmus_lock);
@@ -9343,6 +9315,11 @@ static void account_event(struct perf_event *event)
inc = true;
if (inc) {
+ /*
+ * We need the mutex here because static_branch_enable()
+ * must complete *before* the perf_sched_count increment
+ * becomes visible.
+ */
if (atomic_inc_not_zero(&perf_sched_count))
goto enabled;
@@ -9890,6 +9867,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -10463,7 +10445,7 @@ perf_event_exit_event(struct perf_event *child_event,
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
@@ -10701,7 +10683,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
- enum perf_event_active_state parent_state = parent_event->state;
+ enum perf_event_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -11037,6 +11019,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -11238,5 +11221,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
* controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..09b1537ae06c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
@@ -38,9 +39,9 @@ struct ring_buffer {
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
+ long aux_head;
local_t aux_nest;
- local_t aux_wakeup;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
@@ -208,7 +209,7 @@ static inline int get_recursion_context(int *recursion)
{
int rctx;
- if (in_nmi())
+ if (unlikely(in_nmi()))
rctx = 3;
else if (in_irq())
rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..141aa2ca8728 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
@@ -381,8 +381,8 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
- aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ aux_tail = READ_ONCE(rb->user_page->aux_tail);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -411,6 +411,20 @@ err:
return NULL;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
+
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+ if (rb->aux_overwrite)
+ return false;
+
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ return true;
+ }
+
+ return false;
+}
/*
* Commit the data written by hardware into the ring buffer by adjusting
@@ -433,12 +447,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
if (size || handle->aux_flags) {
@@ -450,12 +464,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags);
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb_need_aux_wakeup(rb))
wakeup = true;
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- }
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -470,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb_free_aux(rb);
ring_buffer_put(rb);
}
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -478,26 +490,24 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
@@ -507,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+EXPORT_SYMBOL_GPL(perf_get_aux);
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 6873bb3e6b7e..0975b0268545 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Handling of different ABIs (personalities).
*
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..6b4298a41167 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
* Ensure that we must observe the pi_state in exit_mm() ->
* mm_release() -> exit_pi_state_list().
*/
- raw_spin_unlock_wait(&tsk->pi_lock);
+ raw_spin_lock_irq(&tsk->pi_lock);
+ raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
@@ -918,8 +916,9 @@ void __noreturn do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
+ lockdep_free_task(tsk);
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -1340,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
- int exit_state = ACCESS_ONCE(p->exit_state);
+ int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
@@ -1601,22 +1600,23 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
+
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ return -EFAULT;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
@@ -1724,25 +1724,27 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err && uru) {
- /* kernel_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- err = copy_to_user(uru, &ru, sizeof(ru));
- else
- err = put_compat_rusage(&ru, uru);
- if (err)
- return -EFAULT;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ return -EFAULT;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..a17fdb63dc3e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -31,6 +31,8 @@
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
@@ -102,15 +104,7 @@ int core_kernel_data(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
- if (core_kernel_text(addr))
- return 1;
- if (is_module_text_address(addr))
- return 1;
- if (is_ftrace_trampoline(addr))
- return 1;
- if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
- if (is_bpf_text_address(addr))
+ if (kernel_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
@@ -127,17 +121,42 @@ int __kernel_text_address(unsigned long addr)
int kernel_text_address(unsigned long addr)
{
+ bool no_rcu;
+ int ret = 1;
+
if (core_kernel_text(addr))
return 1;
+
+ /*
+ * If a stack dump happens while RCU is not watching, then
+ * RCU needs to be notified that it requires to start
+ * watching again. This can happen either by tracing that
+ * triggers a stack trace, or a WARN() that happens during
+ * coming back from idle, or cpu on or offlining.
+ *
+ * is_module_text_address() as well as the kprobe slots
+ * and is_bpf_text_address() require RCU to be watching.
+ */
+ no_rcu = !rcu_is_watching();
+
+ /* Treat this like an NMI as it can happen anywhere */
+ if (no_rcu)
+ rcu_nmi_enter();
+
if (is_module_text_address(addr))
- return 1;
+ goto out;
if (is_ftrace_trampoline(addr))
- return 1;
+ goto out;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
+ goto out;
if (is_bpf_text_address(addr))
- return 1;
- return 0;
+ goto out;
+ ret = 0;
+out:
+ if (no_rcu)
+ rcu_nmi_exit();
+
+ return ret;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index b7e9e57b71ea..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -88,6 +89,7 @@
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
+#include <linux/thread_info.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -213,11 +215,15 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+#ifdef CONFIG_DEBUG_KMEMLEAK
+ /* Clear stale pointers from reused stack. */
+ memset(s->addr, 0, THREAD_SIZE);
+#endif
tsk->stack_vm_area = s;
return s->addr;
}
- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP,
PAGE_KERNEL,
@@ -484,6 +490,8 @@ void __init fork_init(void)
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
#endif
+
+ lockdep_init_task(&init_task);
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -654,7 +662,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = dup_userfaultfd(tmp, &uf);
if (retval)
goto fail_nomem_anon_vma_fork;
- if (anon_vma_fork(tmp, mpnt))
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /* VM_WIPEONFORK gets a clean slate in the child. */
+ tmp->anon_vma = NULL;
+ if (anon_vma_prepare(tmp))
+ goto fail_nomem_anon_vma_fork;
+ } else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
@@ -698,7 +711,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(mm, oldmm, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -815,6 +829,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
+ hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -894,6 +909,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
+ hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -919,7 +935,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
@@ -938,7 +953,9 @@ EXPORT_SYMBOL_GPL(mmput);
#ifdef CONFIG_MMU
static void mmput_async_fn(struct work_struct *work)
{
- struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
__mmput(mm);
}
@@ -1467,8 +1484,7 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
+ p->pi_waiters = RB_ROOT_CACHED;
p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
@@ -1575,10 +1591,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- retval = security_task_create(clone_flags);
- if (retval)
- goto fork_out;
-
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
@@ -1700,6 +1712,7 @@ static __latent_entropy struct task_struct *copy_process(
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
+ lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1958,6 +1971,7 @@ bad_fork_cleanup_audit:
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
+ lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index f50b434756c1..76ed5921117a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
- *
- * Must be called with the hb lock held.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ struct task_struct *owner;
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ owner = pi_state->owner;
+ if (owner) {
+ raw_spin_lock(&owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&owner->pi_lock);
+ }
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
- if (current->pi_state_cache)
+ if (current->pi_state_cache) {
kfree(pi_state);
- else {
+ } else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
@@ -876,6 +880,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
return p;
}
+#ifdef CONFIG_FUTEX_PI
+
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
@@ -897,22 +903,41 @@ void exit_pi_state_list(struct task_struct *curr)
*/
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
-
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
hb = hash_futex(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!atomic_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
continue;
}
@@ -920,9 +945,9 @@ void exit_pi_state_list(struct task_struct *curr)
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
- get_pi_state(pi_state);
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -933,6 +958,8 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+#endif
+
/*
* We need to check the following states:
*
@@ -1204,6 +1231,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
pi_state->owner = p;
raw_spin_unlock_irq(&p->pi_lock);
@@ -1547,6 +1578,53 @@ out:
return ret;
}
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
+ oparg = 1 << oparg;
+ }
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
@@ -1800,6 +1878,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
if (requeue_pi) {
/*
* Requeue PI only works on two distinct uaddrs. This
@@ -2595,6 +2682,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (refill_pi_state_cache())
return -ENOMEM;
@@ -2774,6 +2864,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
struct futex_q *top_waiter;
int ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
retry:
if (get_user(uval, uaddr))
return -EFAULT;
@@ -2820,6 +2913,7 @@ retry:
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
put_pi_state(pi_state);
@@ -2984,6 +3078,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (uaddr == uaddr2)
return -EINVAL;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 3f409968e466..83f830acbb5f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/futex_compat.c
*
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 752d6486b67e..c6c50e5c680e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index c51a49c9be70..9c7c8d5c18f2 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code maintains a list of active profiling data structures.
*
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index edf67c493a8e..6e40ff6be083 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code exports profiling data as debugfs files to userspace.
*
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 27bc88a35013..1e32e66c9563 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 3.4. Future versions of gcc may change the gcov
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 46a18e72bce6..ca5e5c0ef853 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 4.7.
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 92c8e22a29ed..de118ad4a024 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Profiling infrastructure declarations.
*
diff --git a/kernel/groups.c b/kernel/groups.c
index 434f6665f187..e357bc800111 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Supplementary group IDs
*/
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 27c4e774071c..89e355866450 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -63,11 +63,20 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for simulated interrupts
+config IRQ_SIM
+ bool
+ select IRQ_WORK
+
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Support for hierarchical fasteoi+edge and fasteoi+level handlers
+config IRQ_FASTEOI_HIERARCHY_HANDLERS
+ bool
+
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
@@ -88,6 +97,12 @@ config HANDLE_DOMAIN_IRQ
config IRQ_TIMINGS
bool
+config GENERIC_IRQ_MATRIX_ALLOCATOR
+ bool
+
+config GENERIC_IRQ_RESERVATION_MODE
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index e4aef7351f2b..ff6e352e3a6c 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,9 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
+obj-$(CONFIG_IRQ_SIM) += irq_sim.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
@@ -12,3 +14,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
+obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d69bd77252a7..e12d35108225 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 Thomas Gleixner.
* Copyright (C) 2016-2017 Christoph Hellwig.
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index d30a0dd5cc02..4e8089b319ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/autoprobe.c
*
@@ -53,7 +54,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_NORESEND);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3675c6004f2a..043bfc35b353 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,25 +202,29 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
+ if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
* installment or irq auto probing should not happen on
- * managed irqs either. Emit a warning, break the affinity
- * and start it up as a normal interrupt.
+ * managed irqs either.
*/
if (WARN_ON_ONCE(force))
- return IRQ_STARTUP_NORMAL;
+ return IRQ_STARTUP_ABORT;
/*
* The interrupt was requested, but there is no online CPU
* in it's affinity mask. Put it into managed shutdown
* state and let the cpu hotplug mechanism start it up once
* a CPU in the mask becomes available.
*/
- irqd_set_managed_shutdown(d);
return IRQ_STARTUP_ABORT;
}
+ /*
+ * Managed interrupts have reserved resources, so this should not
+ * happen.
+ */
+ if (WARN_ON(irq_domain_activate_irq(d, false)))
+ return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
#else
@@ -236,7 +240,9 @@ static int __irq_startup(struct irq_desc *desc)
struct irq_data *d = irq_desc_get_irq_data(desc);
int ret = 0;
- irq_domain_activate_irq(d);
+ /* Warn if this interrupt is not activated but try nevertheless */
+ WARN_ON_ONCE(!irqd_is_activated(d));
+
if (d->chip->irq_startup) {
ret = d->chip->irq_startup(d);
irq_state_clr_disabled(desc);
@@ -265,10 +271,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
+ irq_do_set_affinity(d, aff, false);
ret = __irq_startup(desc);
- irq_set_affinity_locked(d, aff, false);
break;
case IRQ_STARTUP_ABORT:
+ irqd_set_managed_shutdown(d);
return 0;
}
}
@@ -278,6 +285,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
return ret;
}
+int irq_activate(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ if (!irqd_affinity_is_managed(d))
+ return irq_domain_activate_irq(d, false);
+ return 0;
+}
+
+void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+{
+ if (WARN_ON(irq_activate(desc)))
+ return;
+ irq_startup(desc, resend, IRQ_START_FORCE);
+}
+
static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
@@ -953,7 +976,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
- irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_RESEND);
}
}
@@ -1098,6 +1121,112 @@ void irq_cpu_offline(void)
}
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+
+#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
+/**
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_ack() function
+ * called.
+ */
+void handle_fasteoi_ack_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ /* Start handling the irq */
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
+
+/**
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_mask_ack() function
+ * called.
+ */
+void handle_fasteoi_mask_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+ mask_ack_irq(desc);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
+
+#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
+
/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
@@ -1111,6 +1240,7 @@ void irq_chip_enable_parent(struct irq_data *data)
else
data->chip->irq_unmask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_enable_parent);
/**
* irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
@@ -1125,6 +1255,7 @@ void irq_chip_disable_parent(struct irq_data *data)
else
data->chip->irq_mask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_disable_parent);
/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
@@ -1187,6 +1318,7 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent);
/**
* irq_chip_set_type_parent - Set IRQ type on the parent interrupt
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
static inline bool irq_needs_fixup(struct irq_data *d)
{
const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+ unsigned int cpu = smp_processor_id();
- return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ /*
+ * The cpumask_empty() check is a workaround for interrupt chips,
+ * which do not implement effective affinity, but the architecture has
+ * enabled the config switch. Use the general affinity mask instead.
+ */
+ if (cpumask_empty(m))
+ m = irq_data_get_affinity_mask(d);
+
+ /*
+ * Sanity check. If the mask is not empty when excluding the outgoing
+ * CPU then it must contain at least one online CPU. The outgoing CPU
+ * has been removed from the online mask already.
+ */
+ if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+ cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * If this happens then there was a missed IRQ fixup at some
+ * point. Warn about it and enforce fixup.
+ */
+ pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+ cpumask_pr_args(m), d->irq, cpu);
+ return true;
+ }
+#endif
+ return cpumask_test_cpu(cpu, m);
}
static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e75e29e4434a..17f05ef8f575 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Debugging printout:
*/
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4d384edc0c64..7f608ac39653 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -5,6 +5,7 @@
*/
#include <linux/irqdomain.h>
#include <linux/irq.h>
+#include <linux/uaccess.h>
#include "internals.h"
@@ -80,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
data->domain ? data->domain->name : "");
seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
irq_debug_show_chip(m, data, ind + 1);
+ if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+ data->domain->ops->debug_show(m, NULL, data, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!data->parent_data)
return;
@@ -148,6 +151,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
raw_spin_lock_irq(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %pf\n", desc->handle_irq);
+ seq_printf(m, "device: %s\n", desc->dev_name);
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
ARRAY_SIZE(irqdesc_states));
@@ -171,13 +175,69 @@ static int irq_debug_open(struct inode *inode, struct file *file)
return single_open(file, irq_debug_show, inode->i_private);
}
+static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct irq_desc *desc = file_inode(file)->i_private;
+ char buf[8] = { 0, };
+ size_t size;
+
+ size = min(sizeof(buf) - 1, count);
+ if (copy_from_user(buf, user_buf, size))
+ return -EFAULT;
+
+ if (!strncmp(buf, "trigger", size)) {
+ unsigned long flags;
+ int err;
+
+ /* Try the HW interface first */
+ err = irq_set_irqchip_state(irq_desc_get_irq(desc),
+ IRQCHIP_STATE_PENDING, true);
+ if (!err)
+ return count;
+
+ /*
+ * Otherwise, try to inject via the resend interface,
+ * which may or may not succeed.
+ */
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ if (irq_settings_is_level(desc)) {
+ /* Can't do level, sorry */
+ err = -EINVAL;
+ } else {
+ desc->istate |= IRQS_PENDING;
+ check_irq_resend(desc);
+ err = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+
+ return err ? err : count;
+ }
+
+ return count;
+}
+
static const struct file_operations dfs_irq_ops = {
.open = irq_debug_open,
+ .write = irq_debug_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
+void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ const char *name = dev_name(dev);
+
+ if (name)
+ desc->dev_name = kstrdup(name, GFP_KERNEL);
+}
+
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
char name [10];
@@ -186,7 +246,7 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
return;
sprintf(name, "%d", irq);
- desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc,
+ desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..c26c5bb6b491 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
}
/**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
* @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
*/
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.disable);
+ *ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -322,7 +331,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
- d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a2c48058354c..07d08ca701ec 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* IRQ subsystem internal functions and variables:
*
@@ -74,6 +75,8 @@ extern void __enable_irq(struct irq_desc *desc);
#define IRQ_START_FORCE true
#define IRQ_START_COND false
+extern int irq_activate(struct irq_desc *desc);
+extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
@@ -151,7 +154,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
#define for_each_action_of_desc(desc, act) \
- for (act = desc->act; act; act = act->next)
+ for (act = desc->action; act; act = act->next)
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
@@ -436,6 +439,18 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+{
+ irqd_set_activated(data);
+ return 0;
+}
+static inline void irq_domain_deactivate_irq(struct irq_data *data)
+{
+ irqd_clr_activated(data);
+}
+#endif
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
@@ -443,7 +458,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
debugfs_remove(desc->debugfs_file);
+ kfree(desc->dev_name);
}
+void irq_debugfs_copy_devname(int irq, struct device *dev);
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
@@ -458,4 +475,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
static inline void irq_remove_debugfs_entry(struct irq_desc *d)
{
}
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
new file mode 100644
index 000000000000..24caabf1a0f7
--- /dev/null
+++ b/kernel/irq/irq_sim.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/irq_sim.h>
+#include <linux/irq.h>
+
+struct irq_sim_devres {
+ struct irq_sim *sim;
+};
+
+static void irq_sim_irqmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = false;
+}
+
+static void irq_sim_irqunmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = true;
+}
+
+static struct irq_chip irq_sim_irqchip = {
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+};
+
+static void irq_sim_handle_irq(struct irq_work *work)
+{
+ struct irq_sim_work_ctx *work_ctx;
+
+ work_ctx = container_of(work, struct irq_sim_work_ctx, work);
+ handle_simple_irq(irq_to_desc(work_ctx->irq));
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator: allocate a range of
+ * dummy interrupts.
+ *
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
+{
+ int i;
+
+ sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL);
+ if (!sim->irqs)
+ return -ENOMEM;
+
+ sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0);
+ if (sim->irq_base < 0) {
+ kfree(sim->irqs);
+ return sim->irq_base;
+ }
+
+ for (i = 0; i < num_irqs; i++) {
+ sim->irqs[i].irqnum = sim->irq_base + i;
+ sim->irqs[i].enabled = false;
+ irq_set_chip(sim->irq_base + i, &irq_sim_irqchip);
+ irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]);
+ irq_set_handler(sim->irq_base + i, &handle_simple_irq);
+ irq_modify_status(sim->irq_base + i,
+ IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ }
+
+ init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
+ sim->irq_count = num_irqs;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_sim_init);
+
+/**
+ * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt
+ * descriptors and allocated memory.
+ *
+ * @sim: The interrupt simulator to tear down.
+ */
+void irq_sim_fini(struct irq_sim *sim)
+{
+ irq_work_sync(&sim->work_ctx.work);
+ irq_free_descs(sim->irq_base, sim->irq_count);
+ kfree(sim->irqs);
+}
+EXPORT_SYMBOL_GPL(irq_sim_fini);
+
+static void devm_irq_sim_release(struct device *dev, void *res)
+{
+ struct irq_sim_devres *this = res;
+
+ irq_sim_fini(this->sim);
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator for a managed device.
+ *
+ * @dev: Device to initialize the simulator object for.
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
+ unsigned int num_irqs)
+{
+ struct irq_sim_devres *dr;
+ int rv;
+
+ dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rv = irq_sim_init(sim, num_irqs);
+ if (rv) {
+ devres_free(dr);
+ return rv;
+ }
+
+ dr->sim = sim;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devm_irq_sim_init);
+
+/**
+ * irq_sim_fire - Enqueue an interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt which should be fired.
+ */
+void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
+{
+ if (sim->irqs[offset].enabled) {
+ sim->work_ctx.irq = irq_sim_irqnum(sim, offset);
+ irq_work_queue(&sim->work_ctx.work);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_sim_fire);
+
+/**
+ * irq_sim_irqnum - Get the allocated number of a dummy interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt for which to retrieve
+ * the number.
+ */
+int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset)
+{
+ return sim->irqs[offset].irqnum;
+}
+EXPORT_SYMBOL_GPL(irq_sim_irqnum);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 73be2b3909bd..49b54e9979cc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
static int __init irq_affinity_setup(char *str)
{
- zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
static void __init init_irq_default_affinity(void)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (!irq_default_affinity)
+ if (!cpumask_available(irq_default_affinity))
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
}
@@ -421,10 +419,8 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
kobject_del(&desc->kobj);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
/*
* We free the descriptor, masks and stat fields via RCU. That
@@ -450,7 +446,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
}
}
- flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+ flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
mask = NULL;
for (i = 0; i < cnt; i++) {
@@ -462,20 +458,16 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
+ irq_add_debugfs_entry(start + i, desc);
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
@@ -575,6 +567,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc->owner = owner;
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -670,10 +663,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ mutex_lock(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
- mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
@@ -720,19 +713,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
- goto err;
+ goto unlock;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
- goto err;
+ goto unlock;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, affinity, owner);
-
-err:
+ ret = alloc_descs(start, cnt, node, affinity, owner);
+unlock:
mutex_unlock(&sparse_irq_lock);
return ret;
}
@@ -873,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
void kstat_incr_irq_this_cpu(unsigned int irq)
{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f1f251479aa6..4f4f60015e8a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -21,7 +21,6 @@
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
-static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
static void irq_domain_check_hierarchy(struct irq_domain *domain);
@@ -41,6 +40,9 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
#endif
+const struct fwnode_operations irqchip_fwnode_ops;
+EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
+
/**
* irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
* identifying an irq domain
@@ -86,7 +88,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->data = data;
- fwid->fwnode.type = FWNODE_IRQCHIP;
+ fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
@@ -193,10 +195,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
if (!domain->name) {
- if (fwnode) {
- pr_err("Invalid fwnode type (%d) for irqdomain\n",
- fwnode->type);
- }
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
atomic_inc_return(&unknown_domains));
if (!domain->name) {
@@ -210,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+ mutex_init(&domain->revmap_tree_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
@@ -455,6 +456,31 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+static void irq_domain_clear_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&domain->revmap_tree_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&domain->revmap_tree_mutex);
+ }
+}
+
+static void irq_domain_set_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ struct irq_data *irq_data)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = irq_data->irq;
+ } else {
+ mutex_lock(&domain->revmap_tree_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+ mutex_unlock(&domain->revmap_tree_mutex);
+ }
+}
+
void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
@@ -483,13 +509,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
domain->mapcount--;
/* Clear reverse map for this hwirq */
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
@@ -533,13 +553,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, hwirq, irq_data);
mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -907,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
chip = irq_data_get_irq_chip(data);
seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
- seq_printf(m, data ? "0x%p " : " %p ",
- irq_data_get_irq_chip_data(data));
+ seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
@@ -931,7 +944,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1138,16 +1151,9 @@ static void irq_domain_insert_irq(int virq)
for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
struct irq_domain *domain = data->domain;
- irq_hw_number_t hwirq = data->hwirq;
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, data->hwirq, data);
/* If not already assigned, give the domain the chip's name */
if (!domain->name && data->chip)
@@ -1171,13 +1177,7 @@ static void irq_domain_remove_irq(int virq)
irq_hw_number_t hwirq = data->hwirq;
domain->mapcount--;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
}
@@ -1362,7 +1362,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs)
{
- domain->ops->free(domain, irq_base, nr_irqs);
+ if (domain->ops->free)
+ domain->ops->free(domain, irq_base, nr_irqs);
}
int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
@@ -1448,6 +1449,175 @@ out_free_desc:
return ret;
}
+/* The irq_data was moved, fix the revmap to refer to the new location */
+static void irq_domain_fix_revmap(struct irq_data *d)
+{
+ void __rcu **slot;
+
+ if (d->hwirq < d->domain->revmap_size)
+ return; /* Not using radix tree. */
+
+ /* Fix up the revmap. */
+ mutex_lock(&d->domain->revmap_tree_mutex);
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
+ mutex_unlock(&d->domain->revmap_tree_mutex);
+}
+
+/**
+ * irq_domain_push_irq() - Push a domain in to the top of a hierarchy.
+ * @domain: Domain to push.
+ * @virq: Irq to push the domain in to.
+ * @arg: Passed to the irq_domain_ops alloc() function.
+ *
+ * For an already existing irqdomain hierarchy, as might be obtained
+ * via a call to pci_enable_msix(), add an additional domain to the
+ * head of the processing chain. Must be called before request_irq()
+ * has been called.
+ */
+int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
+{
+ struct irq_data *child_irq_data;
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_desc *desc;
+ int rv = 0;
+
+ /*
+ * Check that no action has been set, which indicates the virq
+ * is in a state where this function doesn't have to deal with
+ * races between interrupt handling and maintaining the
+ * hierarchy. This will catch gross misuse. Attempting to
+ * make the check race free would require holding locks across
+ * calls to struct irq_domain_ops->alloc(), which could lead
+ * to deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (WARN_ON(!irq_domain_is_hierarchy(domain)))
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ if (domain->parent != root_irq_data->domain)
+ return -EINVAL;
+
+ child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
+ irq_data_get_node(root_irq_data));
+ if (!child_irq_data)
+ return -ENOMEM;
+
+ mutex_lock(&irq_domain_mutex);
+
+ /* Copy the original irq_data. */
+ *child_irq_data = *root_irq_data;
+
+ /*
+ * Overwrite the root_irq_data, which is embedded in struct
+ * irq_desc, with values for this domain.
+ */
+ root_irq_data->parent_data = child_irq_data;
+ root_irq_data->domain = domain;
+ root_irq_data->mask = 0;
+ root_irq_data->hwirq = 0;
+ root_irq_data->chip = NULL;
+ root_irq_data->chip_data = NULL;
+
+ /* May (probably does) set hwirq, chip, etc. */
+ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ if (rv) {
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+ goto error;
+ }
+
+ irq_domain_fix_revmap(child_irq_data);
+ irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
+
+error:
+ mutex_unlock(&irq_domain_mutex);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(irq_domain_push_irq);
+
+/**
+ * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy.
+ * @domain: Domain to remove.
+ * @virq: Irq to remove the domain from.
+ *
+ * Undo the effects of a call to irq_domain_push_irq(). Must be
+ * called either before request_irq() or after free_irq().
+ */
+int irq_domain_pop_irq(struct irq_domain *domain, int virq)
+{
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *child_irq_data;
+ struct irq_data *tmp_irq_data;
+ struct irq_desc *desc;
+
+ /*
+ * Check that no action is set, which indicates the virq is in
+ * a state where this function doesn't have to deal with races
+ * between interrupt handling and maintaining the hierarchy.
+ * This will catch gross misuse. Attempting to make the check
+ * race free would require holding locks across calls to
+ * struct irq_domain_ops->free(), which could lead to
+ * deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ tmp_irq_data = irq_domain_get_irq_data(domain, virq);
+
+ /* We can only "pop" if this domain is at the top of the list */
+ if (WARN_ON(root_irq_data != tmp_irq_data))
+ return -EINVAL;
+
+ if (WARN_ON(root_irq_data->domain != domain))
+ return -EINVAL;
+
+ child_irq_data = root_irq_data->parent_data;
+ if (WARN_ON(!child_irq_data))
+ return -EINVAL;
+
+ mutex_lock(&irq_domain_mutex);
+
+ root_irq_data->parent_data = NULL;
+
+ irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_free_irqs_hierarchy(domain, virq, 1);
+
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+
+ irq_domain_fix_revmap(root_irq_data);
+
+ mutex_unlock(&irq_domain_mutex);
+
+ kfree(child_irq_data);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
+
/**
* irq_domain_free_irqs - Free IRQ number and associated data structures
* @virq: base IRQ number
@@ -1511,18 +1681,6 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
-static void __irq_domain_activate_irq(struct irq_data *irq_data)
-{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (irq_data->parent_data)
- __irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
- }
-}
-
static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
{
if (irq_data && irq_data->domain) {
@@ -1535,6 +1693,26 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
}
}
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+{
+ int ret = 0;
+
+ if (irqd && irqd->domain) {
+ struct irq_domain *domain = irqd->domain;
+
+ if (irqd->parent_data)
+ ret = __irq_domain_activate_irq(irqd->parent_data,
+ early);
+ if (!ret && domain->ops->activate) {
+ ret = domain->ops->activate(domain, irqd, early);
+ /* Rollback in case of error */
+ if (ret && irqd->parent_data)
+ __irq_domain_deactivate_irq(irqd->parent_data);
+ }
+ }
+ return ret;
+}
+
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
@@ -1543,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-void irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
{
- if (!irqd_is_activated(irq_data)) {
- __irq_domain_activate_irq(irq_data);
+ int ret = 0;
+
+ if (!irqd_is_activated(irq_data))
+ ret = __irq_domain_activate_irq(irq_data, early);
+ if (!ret)
irqd_set_activated(irq_data);
- }
+ return ret;
}
/**
@@ -1639,6 +1820,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
d->revmap_size + d->revmap_direct_max_irq);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ if (d->ops && d->ops->debug_show)
+ d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!d->parent)
return;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1d1a5b945ab4..2ff1c0c82fc9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+ if (!cpumask_empty(m))
+ return;
+ pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+ chip->name, data->irq);
+#endif
+}
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
+ irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
ret = 0;
}
@@ -381,7 +398,8 @@ int irq_select_affinity_usr(unsigned int irq)
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
* @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
*
* This function uses the vCPU specific data to set the vCPU
* affinity for an irq. The vCPU specific data is passed from
@@ -400,8 +418,18 @@ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
irq_put_desc_unlock(desc, flags);
@@ -509,7 +537,7 @@ void __enable_irq(struct irq_desc *desc)
* time. If it was already started up, then irq_startup()
* will invoke irq_enable() under the hood.
*/
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
}
default:
@@ -1278,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
- new->thread_mask = 1 << ffz(thread_mask);
+ new->thread_mask = 1UL << ffz(thread_mask);
} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
@@ -1315,6 +1343,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
goto out_unlock;
}
+ /*
+ * Activate the interrupt. That activation must happen
+ * independently of IRQ_NOAUTOEN. request_irq() can fail
+ * and the callers are supposed to handle
+ * that. enable_irq() of an interrupt requested with
+ * IRQ_NOAUTOEN is not supposed to fail. The activation
+ * keeps it in shutdown mode, it merily associates
+ * resources if necessary and if that's not possible it
+ * fails. Interrupts which are in managed shutdown mode
+ * will simply ignore that activation request.
+ */
+ ret = irq_activate(desc);
+ if (ret)
+ goto out_unlock;
+
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1390,7 +1433,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
wake_up_process(new->secondary->thread);
register_irq_proc(irq, desc);
- irq_add_debugfs_entry(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
return 0;
@@ -1633,6 +1675,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
#endif
action = __free_irq(irq, dev_id);
+
+ if (!action)
+ return NULL;
+
devname = action->name;
kfree(action);
return devname;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
new file mode 100644
index 000000000000..a3cbbc8191c5
--- /dev/null
+++ b/kernel/irq/matrix.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long))
+
+struct cpumap {
+ unsigned int available;
+ unsigned int allocated;
+ unsigned int managed;
+ bool online;
+ unsigned long alloc_map[IRQ_MATRIX_SIZE];
+ unsigned long managed_map[IRQ_MATRIX_SIZE];
+};
+
+struct irq_matrix {
+ unsigned int matrix_bits;
+ unsigned int alloc_start;
+ unsigned int alloc_end;
+ unsigned int alloc_size;
+ unsigned int global_available;
+ unsigned int global_reserved;
+ unsigned int systembits_inalloc;
+ unsigned int total_allocated;
+ unsigned int online_maps;
+ struct cpumap __percpu *maps;
+ unsigned long scratch_map[IRQ_MATRIX_SIZE];
+ unsigned long system_map[IRQ_MATRIX_SIZE];
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq_matrix.h>
+
+/**
+ * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
+ * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @alloc_start: From which bit the allocation search starts
+ * @alloc_end: At which bit the allocation search ends, i.e first
+ * invalid bit
+ */
+__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+ unsigned int alloc_start,
+ unsigned int alloc_end)
+{
+ struct irq_matrix *m;
+
+ if (matrix_bits > IRQ_MATRIX_BITS)
+ return NULL;
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ return NULL;
+
+ m->matrix_bits = matrix_bits;
+ m->alloc_start = alloc_start;
+ m->alloc_end = alloc_end;
+ m->alloc_size = alloc_end - alloc_start;
+ m->maps = alloc_percpu(*m->maps);
+ if (!m->maps) {
+ kfree(m);
+ return NULL;
+ }
+ return m;
+}
+
+/**
+ * irq_matrix_online - Bring the local CPU matrix online
+ * @m: Matrix pointer
+ */
+void irq_matrix_online(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(cm->online);
+
+ bitmap_zero(cm->alloc_map, m->matrix_bits);
+ cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc);
+ cm->allocated = 0;
+ m->global_available += cm->available;
+ cm->online = true;
+ m->online_maps++;
+ trace_irq_matrix_online(m);
+}
+
+/**
+ * irq_matrix_offline - Bring the local CPU matrix offline
+ * @m: Matrix pointer
+ */
+void irq_matrix_offline(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ /* Update the global available size */
+ m->global_available -= cm->available;
+ cm->online = false;
+ m->online_maps--;
+ trace_irq_matrix_offline(m);
+}
+
+static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+ unsigned int num, bool managed)
+{
+ unsigned int area, start = m->alloc_start;
+ unsigned int end = m->alloc_end;
+
+ bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+ bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+ area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+ if (area >= end)
+ return area;
+ if (managed)
+ bitmap_set(cm->managed_map, area, num);
+ else
+ bitmap_set(cm->alloc_map, area, num);
+ return area;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+ * @bit: Which bit to reserve
+ * @replace: Replace an already allocated vector with a system
+ * vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is life, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+ bool replace)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(bit > m->matrix_bits);
+ BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+ set_bit(bit, m->system_map);
+ if (replace) {
+ BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+ cm->allocated--;
+ m->total_allocated--;
+ }
+ if (bit >= m->alloc_start && bit < m->alloc_end)
+ m->systembits_inalloc++;
+
+ trace_irq_matrix_assign_system(bit, m);
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu, failed_cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ bit = matrix_alloc_area(m, cm, 1, true);
+ if (bit >= m->alloc_end)
+ goto cleanup;
+ cm->managed++;
+ if (cm->online) {
+ cm->available--;
+ m->global_available--;
+ }
+ trace_irq_matrix_reserve_managed(bit, cpu, m, cm);
+ }
+ return 0;
+cleanup:
+ failed_cpu = cpu;
+ for_each_cpu(cpu, msk) {
+ if (cpu == failed_cpu)
+ break;
+ irq_matrix_remove_managed(m, cpumask_of(cpu));
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs
+ *
+ * This removes not allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all what can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ if (WARN_ON_ONCE(!cm->managed))
+ continue;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+ bit = find_first_bit(m->scratch_map, end);
+ if (WARN_ON_ONCE(bit >= end))
+ continue;
+
+ clear_bit(bit, cm->managed_map);
+
+ cm->managed--;
+ if (cm->online) {
+ cm->available++;
+ m->global_available++;
+ }
+ trace_irq_matrix_remove_managed(bit, cpu, m, cm);
+ }
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @cpu: On which CPU the interrupt should be allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+ bit = find_first_bit(m->scratch_map, end);
+ if (bit >= end)
+ return -ENOSPC;
+ set_bit(bit, cm->alloc_map);
+ cm->allocated++;
+ m->total_allocated++;
+ trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+ return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m: Matrix pointer
+ * @bit: Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+ if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+ return;
+ cm->allocated++;
+ m->total_allocated++;
+ cm->available--;
+ m->global_available--;
+ trace_irq_matrix_assign(bit, smp_processor_id(), m, cm);
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows to
+ * setup interrupt descriptors w/o assigning low level resources to it.
+ * The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+ if (m->global_reserved <= m->global_available &&
+ m->global_reserved + 1 > m->global_available)
+ pr_warn("Interrupt reservation exceeds available resources\n");
+
+ m->global_reserved++;
+ trace_irq_matrix_reserve(m);
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
+ * interrupt was never in use and a real vector allocated, which undid the
+ * reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+ m->global_reserved--;
+ trace_irq_matrix_remove_reserved(m);
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @reserved: Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ bool reserved, unsigned int *mapped_cpu)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ if (!cm->online)
+ continue;
+
+ bit = matrix_alloc_area(m, cm, 1, false);
+ if (bit < m->alloc_end) {
+ cm->allocated++;
+ cm->available--;
+ m->total_allocated++;
+ m->global_available--;
+ if (reserved)
+ m->global_reserved--;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc(bit, cpu, m, cm);
+ return bit;
+ }
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m: Matrix pointer
+ * @cpu: Which CPU map needs be updated
+ * @bit: The bit to remove
+ * @managed: If true, the interrupt is managed and not accounted
+ * as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ unsigned int bit, bool managed)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+
+ if (cm->online) {
+ clear_bit(bit, cm->alloc_map);
+ cm->allocated--;
+ m->total_allocated--;
+ if (!managed) {
+ cm->available++;
+ m->global_available++;
+ }
+ }
+ trace_irq_matrix_free(bit, cpu, m, cm);
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m: Pointer to the matrix to query
+ * @cpudown: If true, the local CPU is about to go down, adjust
+ * the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return m->global_available - cpudown ? cm->available : 0;
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m: Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+ return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * @m: Pointer to the matrix to search
+ *
+ * This returns number of allocated irqs
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return cm->allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf: Pointer to the seq_file to print to
+ * @m: Pointer to the matrix allocator
+ * @ind: Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+ unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+ int cpu;
+
+ seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps);
+ seq_printf(sf, "Global available: %6u\n", m->global_available);
+ seq_printf(sf, "Global reserved: %6u\n", m->global_reserved);
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+ seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();
+}
+#endif
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 6ca054a3f91d..86ae0eb80b53 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/irq.h>
#include <linux/interrupt.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 48eadf416c24..edb987b2c58d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -16,6 +16,8 @@
#include <linux/msi.h>
#include <linux/slab.h>
+#include "internals.h"
+
/**
* alloc_msi_entry - Allocate an initialize msi_entry
* @dev: Pointer to the device for which this is allocated
@@ -100,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
return ret;
}
-static void msi_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+static int msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
struct msi_msg msg;
BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
irq_chip_write_msi_msg(irq_data, &msg);
+ return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
@@ -315,11 +318,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
ops->set_desc(arg, desc);
/* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
+ arg);
if (ret)
break;
- irq_set_msi_desc_off(virq, 0, desc);
+ irq_set_msi_desc_off(desc->irq, 0, desc);
}
if (ret) {
@@ -372,8 +376,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
- for (i = 0; i < desc->nvec_used; i++)
+ for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
+ irq_debugfs_copy_devname(virq + i, dev);
+ }
}
if (ops->msi_finish)
@@ -395,11 +401,28 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct irq_data *irq_data;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- irq_domain_activate_irq(irq_data);
+ ret = irq_domain_activate_irq(irq_data, true);
+ if (ret)
+ goto cleanup;
+ if (info->flags & MSI_FLAG_MUST_REACTIVATE)
+ irqd_clr_activated(irq_data);
}
}
-
return 0;
+
+cleanup:
+ for_each_msi_entry(desc, dev) {
+ struct irq_data *irqd;
+
+ if (desc->irq == virq)
+ break;
+
+ irqd = irq_domain_get_irq_data(domain, desc->irq);
+ if (irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
+ }
+ msi_domain_free_irqs(domain, dev);
+ return ret;
}
/**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7f9642a1e267..e8f374971e37 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/proc.c
*
@@ -61,12 +62,12 @@ static int show_irq_affinity(int type, struct seq_file *m)
case EFFECTIVE:
case EFFECTIVE_LIST:
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- mask = desc->irq_common_data.effective_affinity;
+ mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
break;
-#else
- return -EINVAL;
#endif
- };
+ default:
+ return -EINVAL;
+ }
switch (type) {
case AFFINITY_LIST:
@@ -154,8 +155,9 @@ static ssize_t write_irq_affinity(int type, struct file *file,
*/
err = irq_select_affinity_usr(irq) ? -EINVAL : count;
} else {
- irq_set_affinity(irq, new_value);
- err = count;
+ err = irq_set_affinity(irq, new_value);
+ if (!err)
+ err = count;
}
free_cpumask:
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index b86886beee4f..1d08f45135c2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/resend.c
*
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 320579d89091..e43795cd2ccf 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Internal header to deal with irq_desc->status which will be renamed
* to irq_desc->settings.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 061ba7eed4ed..1215229d1c12 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/spurious.c
*
@@ -20,7 +21,7 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
-static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c8c1d073fbf1..e0923fa4927a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now)
* order to prevent the timings circular buffer to be updated
* while we are reading it.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* Number of elements in the circular buffer: If it happens it
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..40e9d739c169 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void)
*/
}
-#ifdef CONFIG_SMP
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
+#ifdef CONFIG_SMP
+
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
@@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
arch_send_call_function_single_ipi(cpu);
+#else /* #ifdef CONFIG_SMP */
+ irq_work_queue(work);
+#endif /* #else #ifdef CONFIG_SMP */
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
@@ -128,9 +131,9 @@ bool irq_work_needs_cpu(void)
static void irq_work_run_list(struct llist_head *list)
{
- unsigned long flags;
- struct irq_work *work;
+ struct irq_work *work, *tmp;
struct llist_node *llnode;
+ unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -138,11 +141,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- while (llnode != NULL) {
- work = llist_entry(llnode, struct irq_work, llnode);
-
- llnode = llist_next(llnode);
-
+ llist_for_each_entry_safe(work, tmp, llnode, llnode) {
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -188,7 +187,7 @@ void irq_work_tick(void)
*/
void irq_work_sync(struct irq_work *work)
{
- WARN_ON_ONCE(irqs_disabled());
+ lockdep_assert_irqs_enabled();
while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..8ff4ca4665ff 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,33 +79,11 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_enable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (!count)
- static_key_slow_inc(key);
-}
-EXPORT_SYMBOL_GPL(static_key_enable);
-
-void static_key_disable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (count)
- static_key_slow_dec(key);
-}
-EXPORT_SYMBOL_GPL(static_key_disable);
-
-void static_key_slow_inc(struct static_key *key)
+static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
/*
* Careful if we get concurrent static_key_slow_inc() calls;
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
return;
}
- cpus_read_lock();
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
atomic_set(&key->enabled, -1);
jump_label_update(key);
- atomic_set(&key->enabled, 1);
+ /*
+ * Ensure that if the above cmpxchg loop observes our positive
+ * value, it must also observe all the text changes.
+ */
+ atomic_set_release(&key->enabled, 1);
} else {
atomic_inc(&key->enabled);
}
jump_label_unlock();
+}
+
+void static_key_slow_inc(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit, struct delayed_work *work)
+void static_key_enable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+
+ if (atomic_read(&key->enabled) > 0) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
+ jump_label_update(key);
+ /*
+ * See static_key_slow_inc().
+ */
+ atomic_set_release(&key->enabled, 1);
+ }
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
+
+void static_key_enable(struct static_key *key)
{
cpus_read_lock();
+ static_key_enable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+
+ if (atomic_read(&key->enabled) != 1) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_cmpxchg(&key->enabled, 1, 0))
+ jump_label_update(key);
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
+
+void static_key_disable(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_disable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
+static void static_key_slow_dec_cpuslocked(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
/*
* The negative count check is valid even when a negative
* key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
- cpus_read_unlock();
return;
}
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
jump_label_update(key);
}
jump_label_unlock();
+}
+
+static void __static_key_slow_dec(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
+ cpus_read_lock();
+ static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
@@ -176,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void static_key_deferred_flush(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
flush_delayed_work(&key->work);
}
EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -198,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..1e6ae66c6244 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -480,6 +480,7 @@ struct kallsym_iter {
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
+ int show_value;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
@@ -582,12 +583,15 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
+ unsigned long value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
+ value = iter->show_value ? iter->value : 0;
+
if (iter->module_name[0]) {
char type;
@@ -597,10 +601,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\n", value,
iter->type, iter->name);
return 0;
}
@@ -612,6 +616,40 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+int kallsyms_show_value(void)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return 1;
+ /* fallthrough */
+ case 1:
+ if (has_capability_noaudit(current, CAP_SYSLOG))
+ return 1;
+ /* fallthrough */
+ default:
+ return 0;
+ }
+}
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
@@ -625,6 +663,7 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
+ iter->show_value = kallsyms_show_value();
return 0;
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..a0e3d7a0e8b8 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fdtable.h>
@@ -131,7 +132,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (filp_epoll) {
filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
fput(filp_epoll);
- } else
+ }
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cd771993f96f..fc6af9e1308b 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING
@@ -270,6 +271,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
static const struct file_operations kcov_fops = {
.open = kcov_open,
.unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
.mmap = kcov_mmap,
.release = kcov_close,
};
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1ae7c41c33c1..20fef1a38602 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
- pages = alloc_pages(gfp_mask, order);
+ pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
count = 1 << order;
for (i = 0; i < count; i++)
SetPageReserved(pages + i);
+
+ arch_kexec_post_alloc_pages(page_address(pages), count,
+ gfp_mask);
+
+ if (gfp_mask & __GFP_ZERO)
+ for (i = 0; i < count; i++)
+ clear_highpage(pages + i);
}
return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
order = page_private(page);
count = 1 << order;
+
+ arch_kexec_pre_free_pages(page_address(page), count);
+
for (i = 0; i < count; i++)
ClearPageReserved(page + i);
__free_pages(page, order);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9f48f4412297..e5bcd94c1efb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
return 1;
}
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+static int locate_mem_hole_callback(struct resource *res, void *arg)
{
struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ u64 start = res->start, end = res->end;
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
@@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
* func returning non-zero, then zero will be returned.
*/
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 50dfcb039a41..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_KEXEC_INTERNAL_H
#define LINUX_KEXEC_INTERNAL_H
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2f37acde640b..bc6addd9152b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -1,23 +1,6 @@
/*
- kmod, the new module loader (replaces kerneld)
- Kirk Petersen
-
- Reorganized not to be a daemon by Adam Richter, with guidance
- from Greg Zornetzer.
-
- Modified to avoid chroot and file sharing problems.
- Mikael Pettersson
-
- Limit the concurrent number of kmod modprobes to catch loops from
- "modprobe needs a service that is in a module".
- Keith Owens <kaos@ocs.com.au> December 1999
-
- Unblock all signals when we exec a usermode process.
- Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
-
- call_usermodehelper wait flag, and remove exec_usermodehelper.
- Rusty Russell <rusty@rustcorp.com.au> Jan 2003
-*/
+ * kmod - the kernel module loader
+ */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -45,15 +28,6 @@
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
-static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
-static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
-static DEFINE_SPINLOCK(umh_sysctl_lock);
-static DECLARE_RWSEM(umhelper_sem);
-
-#ifdef CONFIG_MODULES
/*
* Assuming:
*
@@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...)
return ret;
}
EXPORT_SYMBOL(__request_module);
-
-#endif /* CONFIG_MODULES */
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away
- * or the caller used UMH_NO_WAIT.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
-/*
- * This is the task which runs the usermode application
- */
-static int call_usermodehelper_exec_async(void *data)
-{
- struct subprocess_info *sub_info = data;
- struct cred *new;
- int retval;
-
- spin_lock_irq(&current->sighand->siglock);
- flush_signal_handlers(current, 1);
- spin_unlock_irq(&current->sighand->siglock);
-
- /*
- * Our parent (unbound workqueue) runs with elevated scheduling
- * priority. Avoid propagating that into the userspace child.
- */
- set_user_nice(current, 0);
-
- retval = -ENOMEM;
- new = prepare_kernel_cred(current);
- if (!new)
- goto out;
-
- spin_lock(&umh_sysctl_lock);
- new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
- new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
- new->cap_inheritable);
- spin_unlock(&umh_sysctl_lock);
-
- if (sub_info->init) {
- retval = sub_info->init(sub_info, new);
- if (retval) {
- abort_creds(new);
- goto out;
- }
- }
-
- commit_creds(new);
-
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
-out:
- sub_info->retval = retval;
- /*
- * call_usermodehelper_exec_sync() will call umh_complete
- * if UHM_WAIT_PROC.
- */
- if (!(sub_info->wait & UMH_WAIT_PROC))
- umh_complete(sub_info);
- if (!retval)
- return 0;
- do_exit(0);
-}
-
-/* Handles UMH_WAIT_PROC. */
-static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
-{
- pid_t pid;
-
- /* If SIGCLD is ignored sys_wait4 won't populate the status. */
- kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- sys_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
-
- /* Restore default kernel sig handler */
- kernel_sigaction(SIGCHLD, SIG_IGN);
-
- umh_complete(sub_info);
-}
-
-/*
- * We need to create the usermodehelper kernel thread from a task that is affine
- * to an optimized set of CPUs (or nohz housekeeping ones) such that they
- * inherit a widest affinity irrespective of call_usermodehelper() callers with
- * possibly reduced affinity (eg: per-cpu workqueues). We don't want
- * usermodehelper targets to contend a busy CPU.
- *
- * Unbound workqueues provide such wide affinity and allow to block on
- * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
- *
- * Besides, workqueues provide the privilege level that caller might not have
- * to perform the usermodehelper request.
- *
- */
-static void call_usermodehelper_exec_work(struct work_struct *work)
-{
- struct subprocess_info *sub_info =
- container_of(work, struct subprocess_info, work);
-
- if (sub_info->wait & UMH_WAIT_PROC) {
- call_usermodehelper_exec_sync(sub_info);
- } else {
- pid_t pid;
- /*
- * Use CLONE_PARENT to reparent it to kthreadd; we do not
- * want to pollute current->children, and we need a parent
- * that always ignores SIGCHLD to ensure auto-reaping.
- */
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
- }
- }
-}
-
-/*
- * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
- * (used for preventing user land processes from being created after the user
- * land has been frozen during a system-wide hibernation or suspend operation).
- * Should always be manipulated under umhelper_sem acquired for write.
- */
-static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
-
-/* Number of helpers running */
-static atomic_t running_helpers = ATOMIC_INIT(0);
-
-/*
- * Wait queue head used by usermodehelper_disable() to wait for all running
- * helpers to finish.
- */
-static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
-
-/*
- * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
- * to become 'false'.
- */
-static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
-
-/*
- * Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_disable() fails
- */
-#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-
-int usermodehelper_read_trylock(void)
-{
- DEFINE_WAIT(wait);
- int ret = 0;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_INTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- if (usermodehelper_disabled == UMH_DISABLED)
- ret = -EAGAIN;
-
- up_read(&umhelper_sem);
-
- if (ret)
- break;
-
- schedule();
- try_to_freeze();
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return ret;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
-
-long usermodehelper_read_lock_wait(long timeout)
-{
- DEFINE_WAIT(wait);
-
- if (timeout < 0)
- return -EINVAL;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- up_read(&umhelper_sem);
-
- timeout = schedule_timeout(timeout);
- if (!timeout)
- break;
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return timeout;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
-
-void usermodehelper_read_unlock(void)
-{
- up_read(&umhelper_sem);
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
-
-/**
- * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Change the value of usermodehelper_disabled (under umhelper_sem locked for
- * writing) and wakeup tasks waiting for it to change.
- */
-void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- wake_up(&usermodehelper_disabled_waitq);
- up_write(&umhelper_sem);
-}
-
-/**
- * __usermodehelper_disable - Prevent new helpers from being started.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
- */
-int __usermodehelper_disable(enum umh_disable_depth depth)
-{
- long retval;
-
- if (!depth)
- return -EINVAL;
-
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- up_write(&umhelper_sem);
-
- /*
- * From now on call_usermodehelper_exec() won't start any new
- * helpers, so it is sufficient if running_helpers turns out to
- * be zero at one point (it may be increased later, but that
- * doesn't matter).
- */
- retval = wait_event_timeout(running_helpers_waitq,
- atomic_read(&running_helpers) == 0,
- RUNNING_HELPERS_TIMEOUT);
- if (retval)
- return 0;
-
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- return -EAGAIN;
-}
-
-static void helper_lock(void)
-{
- atomic_inc(&running_helpers);
- smp_mb__after_atomic();
-}
-
-static void helper_unlock(void)
-{
- if (atomic_dec_and_test(&running_helpers))
- wake_up(&running_helpers_waitq);
-}
-
-/**
- * call_usermodehelper_setup - prepare to call a usermode helper
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * Returns either %NULL on allocation failure, or a subprocess_info
- * structure. This should be passed to call_usermodehelper_exec to
- * exec the process and free the structure.
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
- char **envp, gfp_t gfp_mask,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- struct subprocess_info *sub_info;
- sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
- if (!sub_info)
- goto out;
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-
-#ifdef CONFIG_STATIC_USERMODEHELPER
- sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
-#else
- sub_info->path = path;
-#endif
- sub_info->argv = argv;
- sub_info->envp = envp;
-
- sub_info->cleanup = cleanup;
- sub_info->init = init;
- sub_info->data = data;
- out:
- return sub_info;
-}
-EXPORT_SYMBOL(call_usermodehelper_setup);
-
-/**
- * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of system workqueues.
- * (ie. it runs with full root capabilities and optimized affinity).
- */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- int retval = 0;
-
- if (!sub_info->path) {
- call_usermodehelper_freeinfo(sub_info);
- return -EINVAL;
- }
- helper_lock();
- if (usermodehelper_disabled) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
- * If there is no binary for us to call, then just return and get out of
- * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
- * disable all call_usermodehelper() calls.
- */
- if (strlen(sub_info->path) == 0)
- goto out;
-
- /*
- * Set the completion pointer only if there is a waiter.
- * This makes it possible to use umh_complete to free
- * the data structure in case of UMH_NO_WAIT.
- */
- sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
- sub_info->wait = wait;
-
- queue_work(system_unbound_wq, &sub_info->work);
- if (wait == UMH_NO_WAIT) /* task has freed sub_info */
- goto unlock;
-
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
-
- /* umh_complete() will see NULL and free sub_info */
- if (xchg(&sub_info->complete, NULL))
- goto unlock;
- /* fallthrough, umh_complete() was already called */
- }
-
- wait_for_completion(&done);
-wait_done:
- retval = sub_info->retval;
-out:
- call_usermodehelper_freeinfo(sub_info);
-unlock:
- helper_unlock();
- return retval;
-}
-EXPORT_SYMBOL(call_usermodehelper_exec);
-
-/**
- * call_usermodehelper() - prepare and start a usermode application
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * This function is the equivalent to use call_usermodehelper_setup() and
- * call_usermodehelper_exec().
- */
-int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
-{
- struct subprocess_info *info;
- gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
- NULL, NULL, NULL);
- if (info == NULL)
- return -ENOMEM;
-
- return call_usermodehelper_exec(info, wait);
-}
-EXPORT_SYMBOL(call_usermodehelper);
-
-static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
-
- if (write && (!capable(CAP_SETPCAP) ||
- !capable(CAP_SYS_MODULE)))
- return -EPERM;
-
- /*
- * convert from the global kernel_cap_t to the ulong array to print to
- * userspace if this is a read.
- */
- spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
- spin_unlock(&umh_sysctl_lock);
-
- t = *table;
- t.data = &cap_array;
-
- /*
- * actually read or write and array of ulongs from userspace. Remember
- * these are least significant 32 bits first
- */
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
-
- /*
- * Drop everything not in the new_cap (but don't add things)
- */
- spin_lock(&umh_sysctl_lock);
- if (write) {
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
- }
- spin_unlock(&umh_sysctl_lock);
-
- return 0;
-}
-
-struct ctl_table usermodehelper_table[] = {
- {
- .procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- {
- .procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- { }
-};
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
{
return module_alloc(PAGE_SIZE);
}
@@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work)
do_unoptimize_kprobes();
/*
- * Step 2: Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
*/
- synchronize_sched();
+ synchronize_rcu_tasks();
/* Step 3: Optimize kprobes after quiesence period */
do_optimize_kprobes();
@@ -1769,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
+#if 0
int register_jprobes(struct jprobe **jps, int num)
{
int ret = 0, i;
@@ -1837,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num)
}
}
EXPORT_SYMBOL_GPL(unregister_jprobes);
+#endif
#ifdef CONFIG_KRETPROBES
/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..8af313081b0d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
-#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
};
enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
void free_kthread_struct(struct task_struct *k)
{
+ struct kthread *kthread;
+
/*
* Can be NULL if this kthread was created by kernel_thread()
* or if kmalloc() in kthread() failed.
*/
- kfree(to_kthread(k));
+ kthread = to_kthread(k);
+#ifdef CONFIG_BLK_CGROUP
+ WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+ kfree(kthread);
}
/**
@@ -196,7 +204,7 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kmalloc(sizeof(*self), GFP_KERNEL);
+ self = kzalloc(sizeof(*self), GFP_KERNEL);
set_kthread_struct(self);
/* If user was SIGKILLed, I release the structure. */
@@ -212,7 +220,6 @@ static int kthread(void *_create)
do_exit(-ENOMEM);
}
- self->flags = 0;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
@@ -798,15 +805,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
/**
* kthread_delayed_work_timer_fn - callback that queues the associated kthread
* delayed work when the timer expires.
- * @__data: pointer to the data associated with the timer
+ * @t: pointer to the expired timer
*
* The format of the function is defined by struct timer_list.
* It should have been called from irqsafe timer with irq already off.
*/
-void kthread_delayed_work_timer_fn(unsigned long __data)
+void kthread_delayed_work_timer_fn(struct timer_list *t)
{
- struct kthread_delayed_work *dwork =
- (struct kthread_delayed_work *)__data;
+ struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
@@ -837,8 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
@@ -1154,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+ struct kthread *kthread;
+
+ if (!(current->flags & PF_KTHREAD))
+ return;
+ kthread = to_kthread(current);
+ if (!kthread)
+ return;
+
+ if (kthread->blkcg_css) {
+ css_put(kthread->blkcg_css);
+ kthread->blkcg_css = NULL;
+ }
+ if (css) {
+ css_get(css);
+ kthread->blkcg_css = css;
+ }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+ struct kthread *kthread;
+
+ if (current->flags & PF_KTHREAD) {
+ kthread = to_kthread(current);
+ if (kthread)
+ return kthread->blkcg_css;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..de9e45dca70f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
return obj->name;
}
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
- return !obj->name || obj->mod;
-}
-
/* sets obj->mod if object is not vmlinux and module is found */
static void klp_find_object_module(struct klp_object *obj)
{
@@ -285,6 +280,11 @@ static int klp_write_object_relocations(struct module *pmod,
static int __klp_disable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
+
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
+
if (klp_transition_patch)
return -EBUSY;
@@ -295,6 +295,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
klp_init_transition(patch, KLP_UNPATCHED);
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
/*
* Enforce the order of the func->transition writes in
* klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +392,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
if (!klp_is_object_loaded(obj))
continue;
- ret = klp_patch_object(obj);
+ ret = klp_pre_patch_callback(obj);
if (ret) {
- pr_warn("failed to enable patch '%s'\n",
- patch->mod->name);
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
- klp_cancel_transition();
- return ret;
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
}
}
@@ -403,6 +412,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
patch->enabled = true;
return 0;
+err:
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
+ return ret;
}
/**
@@ -830,6 +844,47 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+ struct klp_patch *limit)
+{
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ if (patch == limit)
+ break;
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ /*
+ * Only unpatch the module if the patch is enabled or
+ * is in transition.
+ */
+ if (patch->enabled || patch == klp_transition_patch) {
+
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
+
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
+
+ klp_post_unpatch_callback(obj);
+ }
+
+ klp_free_object_loaded(obj);
+ break;
+ }
+ }
+}
+
int klp_module_coming(struct module *mod)
{
int ret;
@@ -871,13 +926,25 @@ int klp_module_coming(struct module *mod)
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ obj->name);
+ goto err;
+ }
+
ret = klp_patch_object(obj);
if (ret) {
pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
patch->mod->name, obj->mod->name, ret);
+
+ klp_post_unpatch_callback(obj);
goto err;
}
+ if (patch != klp_transition_patch)
+ klp_post_patch_callback(obj);
+
break;
}
}
@@ -894,7 +961,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
- klp_free_object_loaded(obj);
+ klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
return ret;
@@ -902,9 +969,6 @@ err:
void klp_module_going(struct module *mod)
{
- struct klp_patch *patch;
- struct klp_object *obj;
-
if (WARN_ON(mod->state != MODULE_STATE_GOING &&
mod->state != MODULE_STATE_COMING))
return;
@@ -917,25 +981,7 @@ void klp_module_going(struct module *mod)
*/
mod->klp_alive = false;
- list_for_each_entry(patch, &klp_patches, list) {
- klp_for_each_object(patch, obj) {
- if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
- continue;
-
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
- }
-
- klp_free_object_loaded(obj);
- break;
- }
- }
+ klp_cleanup_module_patches_limited(mod, NULL);
mutex_unlock(&klp_mutex);
}
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index c74f24c47837..48a83d4364cf 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -1,6 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_CORE_H
#define _LIVEPATCH_CORE_H
+#include <linux/livepatch.h>
+
extern struct mutex klp_mutex;
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+ int ret = 0;
+
+ if (obj->callbacks.pre_patch)
+ ret = (*obj->callbacks.pre_patch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = !ret;
+
+ return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_patch)
+ (*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.pre_unpatch)
+ (*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_unpatch_enabled &&
+ obj->callbacks.post_unpatch)
+ (*obj->callbacks.post_unpatch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = false;
+}
+
#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/bug.h>
#include <linux/printk.h>
+#include "core.h"
#include "patch.h"
#include "transition.h"
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
index 0db227170c36..e72d8250d04b 100644
--- a/kernel/livepatch/patch.h
+++ b/kernel/livepatch/patch.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_PATCH_H
#define _LIVEPATCH_PATCH_H
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..fdac27588d60
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value. It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer. Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures. Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node: klp_shadow_hash hash table node
+ * @rcu_head: RCU is used to safely free this structure
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: data area
+ */
+struct klp_shadow {
+ struct hlist_node node;
+ struct rcu_head rcu_head;
+ void *obj;
+ unsigned long id;
+ char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow: shadow variable to match
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+ unsigned long id)
+{
+ return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+
+ rcu_read_lock();
+
+ hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ rcu_read_unlock();
+ return shadow->data;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+ struct klp_shadow *new_shadow;
+ void *shadow_data;
+ unsigned long flags;
+
+ /* Check if the shadow variable already exists */
+ shadow_data = klp_shadow_get(obj, id);
+ if (shadow_data)
+ goto exists;
+
+ /* Allocate a new shadow variable for use inside the lock below */
+ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+ if (!new_shadow)
+ return NULL;
+
+ new_shadow->obj = obj;
+ new_shadow->id = id;
+
+ /* Initialize the shadow variable if data provided */
+ if (data)
+ memcpy(new_shadow->data, data, size);
+
+ /* Look for <obj, id> again under the lock */
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+ shadow_data = klp_shadow_get(obj, id);
+ if (unlikely(shadow_data)) {
+ /*
+ * Shadow variable was found, throw away speculative
+ * allocation.
+ */
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ goto exists;
+ }
+
+ /* No <obj, id> found, so attach the newly allocated one */
+ hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+ (unsigned long)new_shadow->obj);
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+ return new_shadow->data;
+
+exists:
+ if (warn_on_exist) {
+ WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+ return NULL;
+ }
+
+ return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space. If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed. The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present. Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj. It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete <obj, id> from hash */
+ hash_for_each_possible(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id: data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete all <*, id> from hash */
+ hash_for_each(klp_shadow_hash, i, shadow, node) {
+ if (klp_shadow_match(shadow, shadow->obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
unsigned int cpu;
bool immediate_func = false;
+ pr_debug("'%s': completing %s transition\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -109,9 +113,6 @@ static void klp_complete_transition(void)
}
}
- if (klp_target_state == KLP_UNPATCHED && !immediate_func)
- module_put(klp_transition_patch->mod);
-
/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
if (klp_target_state == KLP_PATCHED)
klp_synchronize_transition();
@@ -130,6 +131,27 @@ static void klp_complete_transition(void)
}
done:
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+ if (klp_target_state == KLP_PATCHED)
+ klp_post_patch_callback(obj);
+ else if (klp_target_state == KLP_UNPATCHED)
+ klp_post_unpatch_callback(obj);
+ }
+
+ pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * See complementary comment in __klp_enable_patch() for why we
+ * keep the module reference for immediate patches.
+ */
+ if (!klp_transition_patch->immediate && !immediate_func &&
+ klp_target_state == KLP_UNPATCHED) {
+ module_put(klp_transition_patch->mod);
+ }
+
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -145,6 +167,9 @@ void klp_cancel_transition(void)
if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
return;
+ pr_debug("'%s': canceling patching transition, going to unpatch\n",
+ klp_transition_patch->mod->name);
+
klp_target_state = KLP_UNPATCHED;
klp_complete_transition();
}
@@ -408,9 +433,6 @@ void klp_try_complete_transition(void)
}
success:
- pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
/* we're done, now cleanup the data structures */
klp_complete_transition();
}
@@ -426,7 +448,8 @@ void klp_start_transition(void)
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
- pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+ pr_notice("'%s': starting %s transition\n",
+ klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
@@ -482,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
*/
klp_target_state = state;
+ pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
/*
* If the patch can be applied or reverted immediately, skip the
* per-task transitions.
@@ -547,6 +573,11 @@ void klp_reverse_transition(void)
unsigned int cpu;
struct task_struct *g, *task;
+ pr_debug("'%s': reversing transition from %s\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+ "unpatching to patching");
+
klp_transition_patch->enabled = !klp_transition_patch->enabled;
klp_target_state = !klp_target_state;
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index ce09b326546c..0f6e27c481f9 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_TRANSITION_H
#define _LIVEPATCH_TRANSITION_H
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 760158d9d98d..392c7f23af76 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Any varying coverage in these files is non-deterministic
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..db933d063bfc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#include <linux/slab.h>
+#endif
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -72,6 +76,19 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+static int crossrelease_fullstack = 1;
+#else
+static int crossrelease_fullstack;
+#endif
+static int __init allow_crossrelease_fullstack(char *str)
+{
+ crossrelease_fullstack = 1;
+ return 0;
+}
+
+early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -344,14 +361,12 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
-# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
-# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -726,6 +741,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+static void cross_init(struct lockdep_map *lock, int cross);
+static int cross_lock(struct lockdep_map *lock);
+static int lock_acquire_crosslock(struct held_lock *hlock);
+static int lock_release_crosslock(struct lockdep_map *lock);
+#else
+static inline void cross_init(struct lockdep_map *lock, int cross) {}
+static inline int cross_lock(struct lockdep_map *lock) { return 0; }
+static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
+static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
+#endif
+
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1152,41 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
+ if (cross_lock(tgt->instance)) {
+ printk(" Possible unsafe locking scenario by crosslock:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk(" unlock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ } else {
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ }
}
/*
@@ -1165,7 +1211,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- pr_warn("\nbut task is already holding lock:\n");
+
+ if (cross_lock(check_tgt->instance))
+ pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
+ else
+ pr_warn("\nbut task is already holding lock:\n");
+
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1234,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
static noinline int print_circular_bug(struct lock_list *this,
struct lock_list *target,
struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct held_lock *check_tgt,
+ struct stack_trace *trace)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1193,7 +1245,9 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (!save_trace(&this->trace))
+ if (cross_lock(check_tgt->instance))
+ this->trace = *trace;
+ else if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1309,6 +1363,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
return result;
}
+static noinline int
+check_redundant(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1851,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
+ if (cross_lock(prev->instance))
+ continue;
+
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -1813,20 +1883,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int *stack_saved)
+ struct held_lock *next, int distance, struct stack_trace *trace,
+ int (*save)(struct stack_trace *trace))
{
+ struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- int ret;
struct lock_list this;
- struct lock_list *uninitialized_var(target_entry);
- /*
- * Static variable, serialized by the graph_lock().
- *
- * We use this static variable to save the stack trace in case
- * we call into this function multiple times due to encountering
- * trylocks in the held lock stack.
- */
- static struct stack_trace trace;
+ int ret;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1840,8 +1903,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(next);
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret))
- return print_circular_bug(&this, target_entry, next, prev);
+ if (unlikely(!ret)) {
+ if (!trace->entries) {
+ /*
+ * If @save fails here, the printing might trigger
+ * a WARN but because of the !nr_entries it should
+ * not do bad things.
+ */
+ save(trace);
+ }
+ return print_circular_bug(&this, target_entry, next, prev, trace);
+ }
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -1870,15 +1942,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 2;
+ return 1;
}
}
- if (!*stack_saved) {
- if (!save_trace(&trace))
- return 0;
- *stack_saved = 1;
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ this.class = hlock_class(prev);
+ this.parent = NULL;
+ ret = check_redundant(&this, hlock_class(next), &target_entry);
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ return 2;
}
+ if (ret < 0)
+ return print_bfs_bug(ret);
+
+
+ if (!trace->entries && !save(trace))
+ return 0;
/*
* Ok, all validations passed, add the new lock
@@ -1886,33 +1969,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
- /*
- * Debugging printouts:
- */
- if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- /* We drop graph lock, so another thread can overwrite trace. */
- *stack_saved = 0;
- graph_unlock();
- printk("\n new dependency: ");
- print_lock_name(hlock_class(prev));
- printk(KERN_CONT " => ");
- print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
- dump_stack();
- return graph_lock();
- }
- return 1;
+ return 2;
}
/*
@@ -1925,8 +1993,13 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int stack_saved = 0;
struct held_lock *hlock;
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = 0,
+ .entries = NULL,
+ .skip = 0,
+ };
/*
* Debugging checks.
@@ -1947,21 +2020,29 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Only non-crosslock entries get new dependencies added.
+ * Crosslock entries will be added by commit later:
*/
- if (hlock->read != 2 && hlock->check) {
- if (!check_prev_add(curr, hlock, next,
- distance, &stack_saved))
- return 0;
+ if (!cross_lock(hlock->instance)) {
/*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!hlock->trylock)
- break;
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next,
+ distance, &trace, save_trace);
+ if (!ret)
+ return 0;
+
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
}
depth--;
/*
@@ -2126,19 +2207,26 @@ static int check_no_collision(struct task_struct *curr,
}
/*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * This is for building a chain between just two different classes,
+ * instead of adding a new hlock upon current, which is done by
+ * add_chain_cache().
+ *
+ * This can be called in any context with two classes, while
+ * add_chain_cache() must be done within the lock owener's context
+ * since it uses hlock which might be racy in another context.
*/
-static inline int lookup_chain_cache(struct task_struct *curr,
- struct held_lock *hlock,
- u64 chain_key)
+static inline int add_chain_cache_classes(unsigned int prev,
+ unsigned int next,
+ unsigned int irq_context,
+ u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- int i, j;
+
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
/*
* We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2235,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
+
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = irq_context;
+ chain->depth = 2;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
+ chain_hlocks[chain->base] = prev - 1;
+ chain_hlocks[chain->base + 1] = next -1;
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
/*
- * We can walk it lock-free, because entries only get added
- * to the hash:
+ * Important for check_no_collision().
*/
- hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
-cache_hit:
- debug_atomic_inc(chain_lookup_hits);
- if (!check_no_collision(curr, hlock, chain))
- return 0;
-
- if (very_verbose(class))
- printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key,
- class->key, class->name);
+ else {
+ if (!debug_locks_off_graph_unlock())
return 0;
- }
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
}
- if (very_verbose(class))
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key, class->key, class->name);
+#endif
+
+ hlist_add_head_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ int i, j;
+
/*
* Allocate a new chain entry from the static array, and add
* it to the hash:
*/
- if (!graph_lock())
- return 0;
+
/*
- * We have to walk the chain again locked - to avoid duplicates:
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
*/
- hlist_for_each_entry(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
- graph_unlock();
- goto cache_hit;
- }
- }
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2235,6 +2356,78 @@ cache_hit:
return 1;
}
+/*
+ * Look up a dependency chain.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
+
+ return 1;
+}
+
static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
struct held_lock *hlock, int chain_head, u64 chain_key)
{
@@ -2245,11 +2438,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
*
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
- * (If lookup_chain_cache() returns with 1 it acquires
+ * (If lookup_chain_cache_add() return with 1 it acquires
* graph_lock for us)
*/
if (!hlock->trylock && hlock->check &&
- lookup_chain_cache(curr, hlock, chain_key)) {
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
*
@@ -2277,14 +2470,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* Add dependency only if this lock is not the head
* of the chain, and if it's not a secondary read-lock:
*/
- if (!chain_head && ret != 2)
+ if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
return 0;
+ }
+
graph_unlock();
- } else
- /* after lookup_chain_cache(): */
+ } else {
+ /* after lookup_chain_cache_add(): */
if (unlikely(!debug_locks))
return 0;
+ }
return 1;
}
@@ -2567,14 +2763,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
- return class_filter(class);
-#endif
- return 0;
-}
-
#define STRICT_READ_CHECKS 1
static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3058,6 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
-{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks))
- return;
-
- gfp_mask = current_gfp_context(gfp_mask);
-
- /* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
- return;
-
- /* this guy won't enter reclaim */
- if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
- return;
-
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
- return;
-
- /*
- * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
- */
- if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
- return;
-
- /* Disable lockdep if explicitly requested */
- if (gfp_mask & __GFP_NOLOCKDEP)
- return;
-
- mark_held_locks(curr, RECLAIM_FS);
-}
-
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
- unsigned long flags;
-
- if (unlikely(current->lockdep_recursion))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- __lockdep_trace_alloc(gfp_mask, flags);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-
static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
{
/*
@@ -2966,22 +3103,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
- /*
- * We reuse the irq context infrastructure more broadly as a general
- * context checking code. This tests GFP_FS recursion (a lock taken
- * during reclaim for a GFP_FS allocation is held over a GFP_FS
- * allocation).
- */
- if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
- if (hlock->read) {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
- return 0;
- } else {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
- return 0;
- }
- }
-
return 1;
}
@@ -3040,10 +3161,6 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
/*
@@ -3116,7 +3233,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
+static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
int i;
@@ -3174,8 +3291,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 0);
+ __lockdep_init_map(lock, name, key, subclass);
+}
EXPORT_SYMBOL_GPL(lockdep_init_map);
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 1);
+ __lockdep_init_map(lock, name, key, subclass);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
+#endif
+
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3231,6 +3365,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
+ int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3279,7 +3414,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- if (depth) {
+ /* TODO: nest_lock is not implemented for crosslock yet. */
+ if (depth && !cross_lock(lock)) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3367,6 +3503,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
+ ret = lock_acquire_crosslock(hlock);
+ /*
+ * 2 means normal acquire operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3604,11 +3748,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int i;
+ int ret, i;
if (unlikely(!debug_locks))
return 0;
+ ret = lock_release_crosslock(lock);
+ /*
+ * 2 means normal release operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4104,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
}
EXPORT_SYMBOL_GPL(lock_unpin_lock);
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
- current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
-}
-EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
-
-void lockdep_clear_current_reclaim_state(void)
-{
- current->lockdep_reclaim_gfp = 0;
-}
-EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
-
#ifdef CONFIG_LOCK_STAT
static int
print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4624,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
}
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4678,494 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+
+/*
+ * Crossrelease works by recording a lock history for each thread and
+ * connecting those historic locks that were taken after the
+ * wait_for_completion() in the complete() context.
+ *
+ * Task-A Task-B
+ *
+ * mutex_lock(&A);
+ * mutex_unlock(&A);
+ *
+ * wait_for_completion(&C);
+ * lock_acquire_crosslock();
+ * atomic_inc_return(&cross_gen_id);
+ * |
+ * | mutex_lock(&B);
+ * | mutex_unlock(&B);
+ * |
+ * | complete(&C);
+ * `-- lock_commit_crosslock();
+ *
+ * Which will then add a dependency between B and C.
+ */
+
+#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
+
+/*
+ * Whenever a crosslock is held, cross_gen_id will be increased.
+ */
+static atomic_t cross_gen_id; /* Can be wrapped */
+
+/*
+ * Make an entry of the ring buffer invalid.
+ */
+static inline void invalidate_xhlock(struct hist_lock *xhlock)
+{
+ /*
+ * Normally, xhlock->hlock.instance must be !NULL.
+ */
+ xhlock->hlock.instance = NULL;
+}
+
+/*
+ * Lock history stacks; we have 2 nested lock history stacks:
+ *
+ * HARD(IRQ)
+ * SOFT(IRQ)
+ *
+ * The thing is that once we complete a HARD/SOFT IRQ the future task locks
+ * should not depend on any of the locks observed while running the IRQ. So
+ * what we do is rewind the history buffer and erase all our knowledge of that
+ * temporal event.
+ */
+
+void crossrelease_hist_start(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (!cur->xhlocks)
+ return;
+
+ cur->xhlock_idx_hist[c] = cur->xhlock_idx;
+ cur->hist_id_save[c] = cur->hist_id;
+}
+
+void crossrelease_hist_end(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (cur->xhlocks) {
+ unsigned int idx = cur->xhlock_idx_hist[c];
+ struct hist_lock *h = &xhlock(idx);
+
+ cur->xhlock_idx = idx;
+
+ /* Check if the ring was overwritten. */
+ if (h->hist_id != cur->hist_id_save[c])
+ invalidate_xhlock(h);
+ }
+}
+
+/*
+ * lockdep_invariant_state() is used to annotate independence inside a task, to
+ * make one task look like multiple independent 'tasks'.
+ *
+ * Take for instance workqueues; each work is independent of the last. The
+ * completion of a future work does not depend on the completion of a past work
+ * (in general). Therefore we must not carry that (lock) dependency across
+ * works.
+ *
+ * This is true for many things; pretty much all kthreads fall into this
+ * pattern, where they have an invariant state and future completions do not
+ * depend on past completions. Its just that since they all have the 'same'
+ * form -- the kthread does the same over and over -- it doesn't typically
+ * matter.
+ *
+ * The same is true for system-calls, once a system call is completed (we've
+ * returned to userspace) the next system call does not depend on the lock
+ * history of the previous system call.
+ *
+ * They key property for independence, this invariant state, is that it must be
+ * a point where we hold no locks and have no history. Because if we were to
+ * hold locks, the restore at _end() would not necessarily recover it's history
+ * entry. Similarly, independence per-definition means it does not depend on
+ * prior state.
+ */
+void lockdep_invariant_state(bool force)
+{
+ /*
+ * We call this at an invariant point, no current state, no history.
+ * Verify the former, enforce the latter.
+ */
+ WARN_ON_ONCE(!force && current->lockdep_depth);
+ invalidate_xhlock(&xhlock(current->xhlock_idx));
+}
+
+static int cross_lock(struct lockdep_map *lock)
+{
+ return lock ? lock->cross : 0;
+}
+
+/*
+ * This is needed to decide the relationship between wrapable variables.
+ */
+static inline int before(unsigned int a, unsigned int b)
+{
+ return (int)(a - b) < 0;
+}
+
+static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
+{
+ return hlock_class(&xhlock->hlock);
+}
+
+static inline struct lock_class *xlock_class(struct cross_lock *xlock)
+{
+ return hlock_class(&xlock->hlock);
+}
+
+/*
+ * Should we check a dependency with previous one?
+ */
+static inline int depend_before(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check && !hlock->trylock;
+}
+
+/*
+ * Should we check a dependency with next one?
+ */
+static inline int depend_after(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check;
+}
+
+/*
+ * Check if the xhlock is valid, which would be false if,
+ *
+ * 1. Has not used after initializaion yet.
+ * 2. Got invalidated.
+ *
+ * Remind hist_lock is implemented as a ring buffer.
+ */
+static inline int xhlock_valid(struct hist_lock *xhlock)
+{
+ /*
+ * xhlock->hlock.instance must be !NULL.
+ */
+ return !!xhlock->hlock.instance;
+}
+
+/*
+ * Record a hist_lock entry.
+ *
+ * Irq disable is only required.
+ */
+static void add_xhlock(struct held_lock *hlock)
+{
+ unsigned int idx = ++current->xhlock_idx;
+ struct hist_lock *xhlock = &xhlock(idx);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * This can be done locklessly because they are all task-local
+ * state, we must however ensure IRQs are disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+ /* Initialize hist_lock's members */
+ xhlock->hlock = *hlock;
+ xhlock->hist_id = ++current->hist_id;
+
+ xhlock->trace.nr_entries = 0;
+ xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
+ xhlock->trace.entries = xhlock->trace_entries;
+
+ if (crossrelease_fullstack) {
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+ } else {
+ xhlock->trace.nr_entries = 1;
+ xhlock->trace.entries[0] = hlock->acquire_ip;
+ }
+}
+
+static inline int same_context_xhlock(struct hist_lock *xhlock)
+{
+ return xhlock->hlock.irq_context == task_irq_context(current);
+}
+
+/*
+ * This should be lockless as far as possible because this would be
+ * called very frequently.
+ */
+static void check_add_xhlock(struct held_lock *hlock)
+{
+ /*
+ * Record a hist_lock, only in case that acquisitions ahead
+ * could depend on the held_lock. For example, if the held_lock
+ * is trylock then acquisitions ahead never depends on that.
+ * In that case, we don't need to record it. Just return.
+ */
+ if (!current->xhlocks || !depend_before(hlock))
+ return;
+
+ add_xhlock(hlock);
+}
+
+/*
+ * For crosslock.
+ */
+static int add_xlock(struct held_lock *hlock)
+{
+ struct cross_lock *xlock;
+ unsigned int gen_id;
+
+ if (!graph_lock())
+ return 0;
+
+ xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
+
+ /*
+ * When acquisitions for a crosslock are overlapped, we use
+ * nr_acquire to perform commit for them, based on cross_gen_id
+ * of the first acquisition, which allows to add additional
+ * dependencies.
+ *
+ * Moreover, when no acquisition of a crosslock is in progress,
+ * we should not perform commit because the lock might not exist
+ * any more, which might cause incorrect memory access. So we
+ * have to track the number of acquisitions of a crosslock.
+ *
+ * depend_after() is necessary to initialize only the first
+ * valid xlock so that the xlock can be used on its commit.
+ */
+ if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
+ goto unlock;
+
+ gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
+ xlock->hlock = *hlock;
+ xlock->hlock.gen_id = gen_id;
+unlock:
+ graph_unlock();
+ return 1;
+}
+
+/*
+ * Called for both normal and crosslock acquires. Normal locks will be
+ * pushed on the hist_lock queue. Cross locks will record state and
+ * stop regular lock_acquire() to avoid being placed on the held_lock
+ * stack.
+ *
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_acquire_crosslock(struct held_lock *hlock)
+{
+ /*
+ * CONTEXT 1 CONTEXT 2
+ * --------- ---------
+ * lock A (cross)
+ * X = atomic_inc_return(&cross_gen_id)
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Y = atomic_read_acquire(&cross_gen_id)
+ * lock B
+ *
+ * atomic_read_acquire() is for ordering between A and B,
+ * IOW, A happens before B, when CONTEXT 2 see Y >= X.
+ *
+ * Pairs with atomic_inc_return() in add_xlock().
+ */
+ hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
+
+ if (cross_lock(hlock->instance))
+ return add_xlock(hlock);
+
+ check_add_xhlock(hlock);
+ return 2;
+}
+
+static int copy_trace(struct stack_trace *trace)
+{
+ unsigned long *buf = stack_trace + nr_stack_trace_entries;
+ unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ unsigned int nr = min(max_nr, trace->nr_entries);
+
+ trace->nr_entries = nr;
+ memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
+ trace->entries = buf;
+ nr_stack_trace_entries += nr;
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
+{
+ unsigned int xid, pid;
+ u64 chain_key;
+
+ xid = xlock_class(xlock) - lock_classes;
+ chain_key = iterate_chain_key((u64)0, xid);
+ pid = xhlock_class(xhlock) - lock_classes;
+ chain_key = iterate_chain_key(chain_key, pid);
+
+ if (lookup_chain_cache(chain_key))
+ return 1;
+
+ if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
+ chain_key))
+ return 0;
+
+ if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
+ &xhlock->trace, copy_trace))
+ return 0;
+
+ return 1;
+}
+
+static void commit_xhlocks(struct cross_lock *xlock)
+{
+ unsigned int cur = current->xhlock_idx;
+ unsigned int prev_hist_id = xhlock(cur).hist_id;
+ unsigned int i;
+
+ if (!graph_lock())
+ return;
+
+ if (xlock->nr_acquire) {
+ for (i = 0; i < MAX_XHLOCKS_NR; i++) {
+ struct hist_lock *xhlock = &xhlock(cur - i);
+
+ if (!xhlock_valid(xhlock))
+ break;
+
+ if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
+ break;
+
+ if (!same_context_xhlock(xhlock))
+ break;
+
+ /*
+ * Filter out the cases where the ring buffer was
+ * overwritten and the current entry has a bigger
+ * hist_id than the previous one, which is impossible
+ * otherwise:
+ */
+ if (unlikely(before(prev_hist_id, xhlock->hist_id)))
+ break;
+
+ prev_hist_id = xhlock->hist_id;
+
+ /*
+ * commit_xhlock() returns 0 with graph_lock already
+ * released if fail.
+ */
+ if (!commit_xhlock(xlock, xhlock))
+ return;
+ }
+ }
+
+ graph_unlock();
+}
+
+void lock_commit_crosslock(struct lockdep_map *lock)
+{
+ struct cross_lock *xlock;
+ unsigned long flags;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (!current->xhlocks)
+ return;
+
+ /*
+ * Do commit hist_locks with the cross_lock, only in case that
+ * the cross_lock could depend on acquisitions after that.
+ *
+ * For example, if the cross_lock does not have the 'check' flag
+ * then we don't need to check dependencies and commit for that.
+ * Just skip it. In that case, of course, the cross_lock does
+ * not depend on acquisitions ahead, either.
+ *
+ * WARNING: Don't do that in add_xlock() in advance. When an
+ * acquisition context is different from the commit context,
+ * invalid(skipped) cross_lock might be accessed.
+ */
+ if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ xlock = &((struct lockdep_map_cross *)lock)->xlock;
+ commit_xhlocks(xlock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_commit_crosslock);
+
+/*
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_release_crosslock(struct lockdep_map *lock)
+{
+ if (cross_lock(lock)) {
+ if (!graph_lock())
+ return 0;
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
+ graph_unlock();
+ return 1;
+ }
+ return 2;
+}
+
+static void cross_init(struct lockdep_map *lock, int cross)
+{
+ if (cross)
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
+
+ lock->cross = cross;
+
+ /*
+ * Crossrelease assumes that the ring buffer size of xhlocks
+ * is aligned with power of 2. So force it on build.
+ */
+ BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
+}
+
+void lockdep_init_task(struct task_struct *task)
+{
+ int i;
+
+ task->xhlock_idx = UINT_MAX;
+ task->hist_id = 0;
+
+ for (i = 0; i < XHLOCK_CTX_NR; i++) {
+ task->xhlock_idx_hist[i] = UINT_MAX;
+ task->hist_id_save[i] = 0;
+ }
+
+ task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
+ GFP_KERNEL);
+}
+
+void lockdep_free_task(struct task_struct *task)
+{
+ if (task->xhlocks) {
+ void *tmp = task->xhlocks;
+ /* Diable crossrelease for current */
+ task->xhlocks = NULL;
+ kfree(tmp);
+ }
+}
+#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..d459d624ba2a 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* kernel/lockdep_internals.h
*
@@ -143,6 +144,8 @@ struct lockdep_stats {
int redundant_softirqs_on;
int redundant_softirqs_off;
int nr_unused_locks;
+ int nr_redundant_checks;
+ int nr_redundant;
int nr_cyclic_checks;
int nr_cyclic_check_recursions;
int nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..ad69bbc9bd28 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/lockdep_proc.c
*
@@ -201,6 +202,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(chain_lookup_hits));
seq_printf(m, " cyclic checks: %11llu\n",
debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
seq_printf(m, " find-mask forwards checks: %11llu\n",
debug_atomic_read(nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 6a385aabcce7..f046b7ce9dd6 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* MCS lock defines
*
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 4174417d5309..1edd3f45a4ec 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 6ebc1902f779..1c2287d3fa71 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..6ef600aa0f47 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
@@ -109,6 +110,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..c7471c3fb798 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,49 +23,11 @@
#include <linux/spinlock.h>
#include <asm/qrwlock.h>
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
- union {
- atomic_t cnts;
- struct {
-#ifdef __LITTLE_ENDIAN
- u8 wmode; /* Writer mode */
- u8 rcnts[3]; /* Reader counts */
-#else
- u8 rcnts[3]; /* Reader counts */
- u8 wmode; /* Writer mode */
-#endif
- };
- };
- arch_spinlock_t lock;
-};
-
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
-
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -73,13 +35,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will get the lock immediately
- * if the writer is just waiting (not holding the lock yet).
- * The rspin_until_writer_unlock() function returns immediately
- * in this case. Otherwise, they will spin (with ACQUIRE
- * semantics) until the lock is available without waiting in
- * the queue.
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -88,14 +48,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
@@ -110,8 +70,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
-
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -120,30 +78,14 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
(atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- struct __qrwlock *l = (struct __qrwlock *)lock;
-
- if (!READ_ONCE(l->wmode) &&
- (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_add(_QW_WAITING, &lock->cnts);
- cpu_relax();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
-
- cpu_relax();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+ } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) != _QW_WAITING);
unlock:
arch_spin_unlock(&lock->wait_lock);
}
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4ccfcaae5b89..6ee477765e6c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif
@@ -60,21 +61,50 @@ struct pv_node {
#include "qspinlock_stat.h"
/*
+ * Hybrid PV queued/unfair lock
+ *
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
- return true;
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
}
return false;
@@ -101,16 +131,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
/*
* The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
*/
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
return !READ_ONCE(l->locked) &&
- (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
- == _Q_PENDING_VAL);
+ (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+ _Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +168,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
*/
old = val;
new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg(&lock->val, old, new);
+ val = atomic_cmpxchg_acquire(&lock->val, old, new);
if (val == old)
return 1;
@@ -362,8 +392,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* observe its next->locked value and advance itself.
*
* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
*/
- if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
+ != vcpu_halted)
return;
/*
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index ac35e648b0e5..fd4fe1f5b458 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
@@ -58,7 +59,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 5078c6ddf4a5..fc549713bba3 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 649dc9d3951a..6f3dba6e4e9e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node **link = &lock->waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
-
rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+ rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
}
static void
@@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
-
- rb_erase(&waiter->tree_entry, &lock->waiters);
+ rb_erase_cached(&waiter->tree_entry, &lock->waiters);
RB_CLEAR_NODE(&waiter->tree_entry);
}
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
-
rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
}
static void
@@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
-
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
@@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ lock->waiters = RB_ROOT_CACHED;
if (name && key)
debug_rt_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 5c253caffe91..732f96abf462 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..124e98ca0b17 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT Mutexes: blocking mutual exclusion locks with PI support
*
@@ -40,9 +41,12 @@ struct rt_mutex_waiter {
/*
* Various helpers to access the waiters-tree:
*/
+
+#ifdef CONFIG_RT_MUTEXES
+
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
static inline struct rt_mutex_waiter *
@@ -50,8 +54,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
+ w = rb_entry(lock->waiters.rb_leftmost,
+ struct rt_mutex_waiter, tree_entry);
BUG_ON(w->lock != lock);
return w;
@@ -59,16 +63,42 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return rb_entry(p->pi_waiters.rb_leftmost,
+ struct rt_mutex_waiter, pi_tree_entry);
+}
+
+#else
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ return NULL;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return false;
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ return NULL;
}
+#endif
+
/*
* lock->owner state tracking:
*/
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 20819df98125..a7ffb2a96ede 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
* generic spinlock implementation
*
@@ -126,7 +127,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
/*
* get a read lock on the semaphore
*/
-void __sched __down_read(struct rw_semaphore *sem)
+int __sched __down_read_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
unsigned long flags;
@@ -140,8 +141,6 @@ void __sched __down_read(struct rw_semaphore *sem)
goto out;
}
- set_current_state(TASK_UNINTERRUPTIBLE);
-
/* set up my own style of waitqueue */
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +148,41 @@ void __sched __down_read(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
/* wait to be given the lock */
for (;;) {
if (!waiter.task)
break;
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
- __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
out:
- ;
+ return 0;
+
+out_nolock:
+ /*
+ * We didn't take the lock, so that there is a writer, which
+ * is owner or the first waiter of the sem. If it's a waiter,
+ * it will be woken by current owner. Not need to wake anybody.
+ */
+ list_del(&waiter.list);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ return -EINTR;
+}
+
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
}
/*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f18e49..e795908f3607 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* rwsem.c: R/W semaphores: contention handling functions
*
* Written by David Howells (dhowells@redhat.com).
@@ -221,8 +222,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Wait for the read lock to be granted
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
@@ -255,17 +256,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
/* wait to be given the lock */
while (true) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
if (!waiter.task)
break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
@@ -586,6 +614,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
/*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 4d48b1c4870d..f549c552dbf1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
*
* Written by David Howells (dhowells@redhat.com).
@@ -28,6 +29,22 @@ void __sched down_read(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read);
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a699f4048ba1..a883b8f1fdc6 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
* RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..1fd1a7543cdd 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (2004) Linus Torvalds
*
@@ -29,11 +30,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -68,7 +68,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
@@ -88,7 +88,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 39f56c870051..0e4cd64ad2c0 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -362,7 +362,7 @@ static int *get_random_order(int count)
int *order;
int n, r, tmp;
- order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
return order;
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- * barrier() smp_mb() sys_membarrier()
- * barrier() X X O
- * smp_mb() X O O
- * sys_membarrier() O O O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
- if (tick_nohz_full_enabled())
- return -ENOSYS;
- if (unlikely(flags))
- return -EINVAL;
- switch (cmd) {
- case MEMBARRIER_CMD_QUERY:
- return MEMBARRIER_CMD_BITMASK;
- case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
- synchronize_sched();
- return 0;
- default:
- return -EINVAL;
- }
-}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 124bed776532..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,13 +11,14 @@
* General Public License for more details.
*/
#include <linux/radix-tree.h>
-#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
@@ -34,13 +35,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
}
#endif
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
{
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
- if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+ arch_memremap_can_ram_remap(offset, size, flags))
return __va(offset);
+
return NULL; /* fallback to arch_memremap_wb */
}
@@ -48,7 +60,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ * MEMREMAP_ENC, MEMREMAP_DEC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
@@ -95,7 +108,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
- addr = try_ram_remap(offset, size);
+ addr = try_ram_remap(offset, size, flags);
if (!addr)
addr = arch_memremap_wb(offset, size);
}
@@ -182,18 +195,69 @@ struct page_map {
struct vmem_altmap altmap;
};
-static void pgmap_radix_release(struct resource *res)
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
- resource_size_t key, align_start, align_size, align_end;
+ unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+ unsigned long nr_pages, mask;
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
- align_end = align_start + align_size - 1;
+ nr_pages = PHYS_PFN(resource_size(res));
+ if (nr_pages == pgoff)
+ return ULONG_MAX;
+
+ /*
+ * What is the largest aligned power-of-2 range available from
+ * this resource pgoff to the end of the resource range,
+ * considering the alignment of the current pgoff?
+ */
+ mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+ if (!mask)
+ return ULONG_MAX;
+
+ return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct page *page = device_private_entry_to_page(entry);
+
+ /*
+ * The page_fault() callback must migrate page back to system memory
+ * so that CPU can access it. This might fail for various reasons
+ * (device issue, device was unsafely unplugged, ...). When such
+ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+ *
+ * Note that because memory cgroup charges are accounted to the device
+ * memory, this should never fail because of memory restrictions (but
+ * allocation of regular system page might still fail because we are
+ * out of memory).
+ *
+ * There is a more in-depth description of what that callback can and
+ * cannot do, in include/linux/memremap.h
+ */
+ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
+static void pgmap_radix_release(struct resource *res)
+{
+ unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- for (key = res->start; key <= res->end; key += SECTION_SIZE)
- radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ foreach_order_pgoff(res, order, pgoff)
+ radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
mutex_unlock(&pgmap_lock);
+
+ synchronize_rcu();
}
static unsigned long pfn_first(struct page_map *page_map)
@@ -256,7 +320,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
WARN_ON_ONCE(!rcu_read_lock_held());
- page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
return page_map ? &page_map->pgmap : NULL;
}
@@ -281,12 +345,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t align_start, align_size, align_end;
+ unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
- int error, nid, is_ram;
- unsigned long pfn;
+ int error, nid, is_ram, i = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -321,15 +385,20 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
+ pgmap->type = MEMORY_DEVICE_HOST;
+ pgmap->page_fault = NULL;
+ pgmap->page_free = NULL;
+ pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+ foreach_order_pgoff(res, order, pgoff) {
struct dev_pagemap *dup;
rcu_read_lock();
- dup = find_dev_pagemap(key);
+ dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
rcu_read_unlock();
if (dup) {
dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -337,8 +406,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
error = -EBUSY;
break;
}
- error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
- page_map);
+ error = __radix_tree_insert(&pgmap_radix,
+ PHYS_PFN(res->start) + pgoff, order, page_map);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
@@ -379,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
+ if (!(++i % 1024))
+ cond_resched();
}
devres_add(dev, page_map);
return __va(res->start);
@@ -430,3 +501,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
#endif /* CONFIG_ZONE_DEVICE */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
+ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/kernel/module.c b/kernel/module.c
index 40f983cbea81..32c2cdaccd93 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
/* Block module loading/unloading? */
int modules_disabled = 0;
core_param(nomodule, modules_disabled, bint, 0);
@@ -1516,7 +1526,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUGO;
+ sattr->mattr.attr.mode = S_IRUSR;
*(gattr++) = &(sattr++)->mattr.attr;
}
*gattr = NULL;
@@ -2707,21 +2717,21 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
}
#endif /* CONFIG_KALLSYMS */
-static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
+static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
{
if (!debug)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, debug->modname))
+ if (ddebug_add_module(debug, num, mod->name))
pr_err("dynamic debug error adding module: %s\n",
debug->modname);
#endif
}
-static void dynamic_debug_remove(struct _ddebug *debug)
+static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
{
if (debug)
- ddebug_remove_module(debug->modname);
+ ddebug_remove_module(mod->name);
}
void * __weak module_alloc(unsigned long size)
@@ -3715,7 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_arch_cleanup;
}
- dynamic_debug_setup(info->debug, info->num_debug);
+ dynamic_debug_setup(mod, info->debug, info->num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -3779,7 +3789,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_disable_nx(mod);
ddebug_cleanup:
- dynamic_debug_remove(info->debug);
+ dynamic_debug_remove(mod, info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
@@ -4147,6 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
+ unsigned long value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4162,7 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->core_layout.base);
+ value = m->private ? 0 : (unsigned long)mod->core_layout.base;
+ seq_printf(m, " 0x" KALLSYM_FMT, value);
/* Taints info */
if (mod->taints)
@@ -4184,9 +4196,23 @@ static const struct seq_operations modules_op = {
.show = m_show
};
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
static int modules_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &modules_op);
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+ m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ }
+
+ return 0;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..f262c9a4e70a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->cb_cpu = cb_cpu;
target_cpu = padata_cpu_hash(pd);
+ padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -275,11 +276,51 @@ static void padata_reorder(struct parallel_data *pd)
return;
}
+static void invoke_padata_reorder(struct work_struct *work)
+{
+ struct padata_parallel_queue *pqueue;
+ struct parallel_data *pd;
+
+ local_bh_disable();
+ pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+ pd = pqueue->pd;
+ padata_reorder(pd);
+ local_bh_enable();
+}
+
static void padata_reorder_timer(unsigned long arg)
{
struct parallel_data *pd = (struct parallel_data *)arg;
+ unsigned int weight;
+ int target_cpu, cpu;
- padata_reorder(pd);
+ cpu = get_cpu();
+
+ /* We don't lock pd here to not interfere with parallel processing
+ * padata_reorder() calls on other CPUs. We just need any CPU out of
+ * the cpumask.pcpu set. It would be nice if it's the right one but
+ * it doesn't matter if we're off to the next one by using an outdated
+ * pd->processed value.
+ */
+ weight = cpumask_weight(pd->cpumask.pcpu);
+ target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+ /* ensure to call the reorder callback on the correct CPU */
+ if (cpu != target_cpu) {
+ struct padata_parallel_queue *pqueue;
+ struct padata_instance *pinst;
+
+ /* The timer function is serialized wrt itself -- no locking
+ * needed.
+ */
+ pinst = pd->pinst;
+ pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+ queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+ } else {
+ padata_reorder(pd);
+ }
+
+ put_cpu();
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -323,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
int cpu;
struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
+ int reorder_via_wq = 0;
pd = padata->pd;
cpu = get_cpu();
+
+ /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+ * was called on -- or, at least, enqueue the padata object into the
+ * correct per-cpu queue.
+ */
+ if (cpu != padata->cpu) {
+ reorder_via_wq = 1;
+ cpu = padata->cpu;
+ }
+
pqueue = per_cpu_ptr(pd->pqueue, cpu);
spin_lock(&pqueue->reorder.lock);
@@ -336,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
put_cpu();
- padata_reorder(pd);
+ /* If we're running on the wrong CPU, call padata_reorder() via a
+ * kernel worker.
+ */
+ if (reorder_via_wq)
+ queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+ else
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -384,8 +442,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->pd = pd;
pqueue->cpu_index = cpu_index;
cpu_index++;
@@ -393,6 +457,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
+ INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..bdd18afa19a4 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/bug.h>
+#include <linux/ratelimit.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
+#ifdef CONFIG_ARCH_HAS_REFCOUNT
+void refcount_error_report(struct pt_regs *regs, const char *err)
+{
+ WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
+ err, (void *)instruction_pointer(regs),
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()),
+ from_kuid_munged(&init_user_ns, current_euid()));
+}
+#endif
+
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
struct kernel_param p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
kernel_param_unlock(mk->mod);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
return count;
}
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a7255b4d..4918314893bc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
int i;
int err;
+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_OPP
- bool
- select SRCU
- ---help---
- SOCs have a standard set of tuples consisting of frequency and
- voltage pairs that the device will support per voltage domain. This
- is called Operating Performance Point or OPP. The actual definitions
- of OPP varies over silicon within the same family of devices.
-
- OPP layer organizes the data internally using device pointers
- representing individual voltage domains and provides SOC
- implementations a ready to use framework to manage OPPs.
- For more information, read <file:Documentation/power/opp.txt>
-
config PM_CLK
def_bool y
depends on PM && HAVE_CLK
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index eb4f717705ba..a3f79f0eef36 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9012ecf7b814..41e83a779e19 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/autosleep.c
*
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 0e781798b0b3..fcdf0e14a47d 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Functions for saving/restoring console.
*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e1914c7b85b1..a5c36e9c56a6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -651,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("Loading hibernation image.\n");
+ pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -681,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("Hibernation not available.\n");
+ pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
@@ -692,6 +692,7 @@ int hibernate(void)
goto Unlock;
}
+ pr_info("hibernation entry\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error) {
@@ -727,7 +728,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("Writing image.\n");
+ pm_pr_dbg("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -739,7 +740,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("Image restored successfully.\n");
+ pm_pr_dbg("Image restored successfully.\n");
}
Free_bitmaps:
@@ -747,7 +748,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("Checking hibernation image\n");
+ pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -762,6 +763,8 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
+ pr_info("hibernation exit\n");
+
return error;
}
@@ -811,7 +814,7 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("Checking hibernation image partition %s\n", resume_file);
+ pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
pr_info("Waiting %dsec before reading resume device ...\n",
@@ -853,10 +856,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("Hibernation image partition %d:%d present\n",
+ pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("Looking for hibernation image.\n");
+ pm_pr_dbg("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -868,6 +871,7 @@ static int software_resume(void)
goto Unlock;
}
+ pr_info("resume from hibernation\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (error) {
@@ -875,7 +879,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -884,11 +888,12 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
+ pr_info("resume from hibernation failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("Hibernation image not present or could not be loaded.\n");
+ pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1012,8 +1017,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("Hibernation mode set to '%s'\n",
- hibernation_modes[mode]);
+ pm_pr_dbg("Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 42bd800a6755..3a2ca9066583 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -150,7 +150,7 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
#endif /* CONFIG_SUSPEND */
-#ifdef CONFIG_PM_DEBUG
+#ifdef CONFIG_PM_SLEEP_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
@@ -211,7 +211,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#endif /* CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_SLEEP_DEBUG */
#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
@@ -361,6 +361,61 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
power_attr_ro(pm_wakeup_irq);
+bool pm_debug_messages_on __read_mostly;
+
+static ssize_t pm_debug_messages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_debug_messages_on);
+}
+
+static ssize_t pm_debug_messages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_debug_messages_on = !!val;
+ return n;
+}
+
+power_attr(pm_debug_messages);
+
+/**
+ * __pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * @defer: Whether or not to use printk_deferred() to print the message.
+ * @fmt: Message format.
+ *
+ * The message will be emitted if enabled through the pm_debug_messages
+ * sysfs attribute.
+ */
+void __pm_pr_dbg(bool defer, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!pm_debug_messages_on)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (defer)
+ printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
+ else
+ printk(KERN_DEBUG "PM: %pV", &vaf);
+
+ va_end(args);
+}
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -691,12 +746,11 @@ static struct attribute * g[] = {
&wake_lock_attr.attr,
&wake_unlock_attr.attr,
#endif
-#ifdef CONFIG_PM_DEBUG
- &pm_test_attr.attr,
-#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_test_attr.attr,
&pm_print_times_attr.attr,
&pm_wakeup_irq_attr.attr,
+ &pm_debug_messages_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7fdc40d31b7d..f29cd178df90 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/suspend.h>
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
@@ -192,7 +193,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
extern const char * const pm_labels[];
extern const char *pm_states[];
extern const char *mem_sleep_states[];
-extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
@@ -245,7 +245,11 @@ enum {
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
+#ifdef CONFIG_PM_SLEEP_DEBUG
extern int pm_test_level;
+#else
+#define pm_test_level (TEST_NONE)
+#endif
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..7381d49a44db 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* drivers/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
@@ -20,8 +21,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -202,6 +204,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: %s setup failed\n",
- pm_qos_array[i]->name);
+ pr_err("%s: %s setup failed\n",
+ __func__, pm_qos_array[i]->name);
return ret;
}
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..a917a301e201 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
Report:
- printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
}
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
list_for_each_entry(region, &nosave_regions, list) {
unsigned long pfn;
- pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
(unsigned long long) region->start_pfn << PAGE_SHIFT,
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
free_pages_map = bm2;
mark_nosave_pages(forbidden_pages_map);
- pr_debug("PM: Basic memory bitmaps created\n");
+ pr_debug("Basic memory bitmaps created\n");
return 0;
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
memory_bm_free(bm2, PG_UNSAFE_CLEAR);
kfree(bm2);
- pr_debug("PM: Basic memory bitmaps freed\n");
+ pr_debug("Basic memory bitmaps freed\n");
}
void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
pfn = memory_bm_next_pfn(bm);
}
memory_bm_position_reset(bm);
- pr_info("PM: free pages cleared after restore\n");
+ pr_info("free pages cleared after restore\n");
#endif /* PAGE_POISONING_ZERO */
}
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- printk(KERN_INFO "PM: Preallocating image memory... ");
+ pr_info("Preallocating image memory... ");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ pr_cont("done (allocated %lu pages)\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- printk(KERN_CONT "\n");
+ pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
free += zone_page_state(zone, NR_FREE_PAGES);
nr_pages += count_pages_for_highmem(nr_highmem);
- pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
- nr_pages, PAGES_FOR_IO, free);
+ pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
return free > nr_pages + PAGES_FOR_IO;
}
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- printk(KERN_INFO "PM: Creating hibernation image:\n");
+ pr_info("Creating hibernation image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
nr_highmem = count_highmem_pages();
- printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+ pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
if (!enough_free_mem(nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Not enough free memory\n");
+ pr_err("Not enough free memory\n");
return -ENOMEM;
}
if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Memory allocation failed\n");
+ pr_err("Memory allocation failed\n");
return -ENOMEM;
}
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
- nr_pages);
+ pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
return 0;
}
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
if (!reason && info->num_physpages != get_num_physpages())
reason = "memory size";
if (reason) {
- printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ pr_err("Image mismatch: %s\n", reason);
return -EPERM;
}
return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3ecf275d7e44..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -33,53 +35,55 @@
#include "power.h"
const char * const pm_labels[] = {
- [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_TO_IDLE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
const char *pm_states[PM_SUSPEND_MAX];
static const char * const mem_sleep_labels[] = {
- [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_TO_IDLE] = "s2idle",
[PM_SUSPEND_STANDBY] = "shallow",
[PM_SUSPEND_MEM] = "deep",
};
const char *mem_sleep_states[PM_SUSPEND_MAX];
-suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
+suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+suspend_state_t pm_suspend_target_state;
+EXPORT_SYMBOL_GPL(pm_suspend_target_state);
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
-static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static const struct platform_s2idle_ops *s2idle_ops;
+static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
-enum freeze_state __read_mostly suspend_freeze_state;
-static DEFINE_SPINLOCK(suspend_freeze_lock);
+enum s2idle_states __read_mostly s2idle_state;
+static DEFINE_SPINLOCK(s2idle_lock);
-void freeze_set_ops(const struct platform_freeze_ops *ops)
+void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
lock_system_sleep();
- freeze_ops = ops;
+ s2idle_ops = ops;
unlock_system_sleep();
}
-static void freeze_begin(void)
+static void s2idle_begin(void)
{
- suspend_freeze_state = FREEZE_STATE_NONE;
+ s2idle_state = S2IDLE_STATE_NONE;
}
-static void freeze_enter(void)
+static void s2idle_enter(void)
{
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
- suspend_freeze_state = FREEZE_STATE_ENTER;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_ENTER;
+ spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -87,56 +91,79 @@ static void freeze_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(suspend_freeze_wait_head,
- suspend_freeze_state == FREEZE_STATE_WAKE);
+ wait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
out:
- suspend_freeze_state = FREEZE_STATE_NONE;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_NONE;
+ spin_unlock_irq(&s2idle_lock);
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
static void s2idle_loop(void)
{
- pr_debug("PM: suspend-to-idle\n");
+ pm_pr_dbg("suspend-to-idle\n");
- do {
- freeze_enter();
+ for (;;) {
+ int error;
+
+ dpm_noirq_begin();
+
+ /*
+ * Suspend-to-idle equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after
+ * all devices have been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious,
+ * so prevent them from terminating the loop right away.
+ */
+ error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
+ if (!error)
+ s2idle_enter();
+ else if (error == -EBUSY && pm_wakeup_pending())
+ error = 0;
+
+ if (!error && s2idle_ops && s2idle_ops->wake)
+ s2idle_ops->wake();
- if (freeze_ops && freeze_ops->wake)
- freeze_ops->wake();
+ dpm_noirq_resume_devices(PMSG_RESUME);
- dpm_resume_noirq(PMSG_RESUME);
- if (freeze_ops && freeze_ops->sync)
- freeze_ops->sync();
+ dpm_noirq_end();
+
+ if (error)
+ break;
+
+ if (s2idle_ops && s2idle_ops->sync)
+ s2idle_ops->sync();
if (pm_wakeup_pending())
break;
pm_wakeup_clear(false);
- } while (!dpm_suspend_noirq(PMSG_SUSPEND));
+ }
- pr_debug("PM: resume from suspend-to-idle\n");
+ pm_pr_dbg("resume from suspend-to-idle\n");
}
-void freeze_wake(void)
+void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&suspend_freeze_lock, flags);
- if (suspend_freeze_state > FREEZE_STATE_NONE) {
- suspend_freeze_state = FREEZE_STATE_WAKE;
- wake_up(&suspend_freeze_wait_head);
+ spin_lock_irqsave(&s2idle_lock, flags);
+ if (s2idle_state > S2IDLE_STATE_NONE) {
+ s2idle_state = S2IDLE_STATE_WAKE;
+ wake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+ spin_unlock_irqrestore(&s2idle_lock, flags);
}
-EXPORT_SYMBOL_GPL(freeze_wake);
+EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
@@ -152,19 +179,19 @@ void __init pm_states_init(void)
{
/* "mem" and "freeze" are always present in /sys/power/state. */
pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
+ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE];
/*
* Suspend-to-idle should be supported even without any suspend_ops,
* initialize mem_sleep_states[] accordingly here.
*/
- mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
+ mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE];
}
static int __init mem_sleep_default_setup(char *str)
{
suspend_state_t state;
- for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
@@ -193,7 +220,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default == PM_SUSPEND_MEM)
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
@@ -216,49 +243,49 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
}
static int platform_suspend_prepare(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ?
suspend_ops->prepare() : 0;
}
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
- freeze_ops->prepare() : 0;
+ return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ?
+ s2idle_ops->prepare() : 0;
}
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
suspend_ops->wake();
}
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
- freeze_ops->restore();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore)
+ s2idle_ops->restore();
}
static void platform_resume_finish(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish)
suspend_ops->finish();
}
static int platform_suspend_begin(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
- return freeze_ops->begin();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin)
+ return s2idle_ops->begin();
else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
@@ -267,21 +294,21 @@ static int platform_suspend_begin(suspend_state_t state)
static void platform_resume_end(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end)
+ s2idle_ops->end();
else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
static void platform_recover(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover)
suspend_ops->recover();
}
static bool platform_suspend_again(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ?
suspend_ops->suspend_again() : false;
}
@@ -370,16 +397,21 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- pr_err("PM: late suspend of devices failed\n");
+ pr_err("late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
goto Devices_early_resume;
+ if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
+ s2idle_loop();
+ goto Platform_early_resume;
+ }
+
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- pr_err("PM: noirq suspend of devices failed\n");
+ pr_err("noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -389,17 +421,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- /*
- * PM_SUSPEND_FREEZE equals
- * frozen processes + suspended devices + idle processors.
- * Thus we should invoke freeze_enter() soon after
- * all the devices are suspended.
- */
- if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -416,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = suspend_ops->enter(state);
trace_suspend_resume(TPS("machine_suspend"),
state, false);
- events_check_enabled = false;
} else if (*wakeup) {
error = -EBUSY;
}
@@ -456,6 +476,8 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!sleep_state_supported(state))
return -ENOSYS;
+ pm_suspend_target_state = state;
+
error = platform_suspend_begin(state);
if (error)
goto Close;
@@ -464,7 +486,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ pr_err("Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -485,6 +507,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
platform_resume_end(state);
+ pm_suspend_target_state = PM_SUSPEND_ON;
return error;
Recover_platform:
@@ -518,10 +541,10 @@ static int enter_state(suspend_state_t state)
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
+ pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
@@ -531,18 +554,18 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_FREEZE)
- freeze_begin();
+ if (state == PM_SUSPEND_TO_IDLE)
+ s2idle_begin();
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_info("PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... ");
sys_sync();
pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
- pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
@@ -552,13 +575,14 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
+ pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
Finish:
- pr_debug("PM: Finishing wakeup.\n");
+ events_check_enabled = false;
+ pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
@@ -579,6 +603,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -586,6 +611,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pr_info("suspend exit\n");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 5db217051232..6a897e8b2a88 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -104,9 +104,9 @@ repeat:
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status < 0)
- state = PM_SUSPEND_FREEZE;
+ state = PM_SUSPEND_TO_IDLE;
}
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d22571f306..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
@@ -241,10 +243,9 @@ static void hib_end_io(struct bio *bio)
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_status) {
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
}
if (bio_data_dir(bio) == WRITE)
@@ -270,12 +271,12 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = hib_resume_bdev;
+ bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_err("Adding page to bio failed at %llu\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
bio_put(bio);
return -EFAULT;
}
@@ -320,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Swap header not found!\n");
+ pr_err("Swap header not found!\n");
error = -ENODEV;
}
return error;
@@ -414,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
ret = swsusp_swap_check();
if (ret) {
if (ret != -ENOSPC)
- printk(KERN_ERR "PM: Cannot find swap device, try "
- "swapon -a.\n");
+ pr_err("Cannot find swap device, try swapon -a\n");
return ret;
}
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -492,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
{
if (!error) {
flush_swap_writer(handle);
- printk(KERN_INFO "PM: S");
+ pr_info("S");
error = mark_swapfiles(handle, flags);
- printk("|\n");
+ pr_cont("|\n");
}
if (error)
@@ -543,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
- printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+ pr_info("Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
@@ -558,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -567,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -693,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -709,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -727,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start compression threads\n");
+ pr_err("Cannot start compression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -750,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -761,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- printk(KERN_INFO
- "PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages)...\n",
- nr_threads, nr_to_write);
+ pr_info("Using %u thread(s) for compression\n", nr_threads);
+ pr_info("Compressing and saving image data (%u pages)...\n",
+ nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
@@ -784,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image saving progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -814,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR "PM: LZO compression failed\n");
+ pr_err("LZO compression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(data[thr].unc_len))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -858,7 +853,7 @@ out_finish:
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
@@ -889,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
- pr_debug("PM: Free swap pages: %u\n", free_swap);
+ pr_debug("Free swap pages: %u\n", free_swap);
required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
@@ -916,12 +911,12 @@ int swsusp_write(unsigned int flags)
pages = snapshot_get_image_size();
error = get_swap_writer(&handle);
if (error) {
- printk(KERN_ERR "PM: Cannot get swap writer\n");
+ pr_err("Cannot get swap writer\n");
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
+ pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
}
@@ -1069,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
clean_pages_on_read = true;
- printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
- nr_to_read);
+ pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1088,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -1097,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
@@ -1191,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1207,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1227,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start decompression threads\n");
+ pr_err("Cannot start decompression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1250,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1275,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
- printk(KERN_ERR
- "PM: Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate LZO pages\n");
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1286,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- printk(KERN_INFO
- "PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages)...\n",
- nr_threads, nr_to_read);
+ pr_info("Using %u thread(s) for decompression\n", nr_threads);
+ pr_info("Loading and decompressing image data (%u pages)...\n",
+ nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1349,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(LZO_UNC_SIZE))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -1401,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR
- "PM: LZO decompression failed\n");
+ pr_err("LZO decompression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
data[thr].unc_len > LZO_UNC_SIZE ||
data[thr].unc_len & (PAGE_SIZE - 1))) {
- printk(KERN_ERR
- "PM: Invalid LZO uncompressed length\n");
+ pr_err("Invalid LZO uncompressed length\n");
ret = -1;
goto out_finish;
}
@@ -1421,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image loading progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
@@ -1449,15 +1435,14 @@ out_finish:
}
stop = ktime_get();
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
if(handle->crc32 != swsusp_header->crc32) {
- printk(KERN_ERR
- "PM: Invalid image CRC32!\n");
+ pr_err("Invalid image CRC32!\n");
ret = -ENODATA;
}
}
@@ -1514,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
swap_reader_finish(&handle);
end:
if (!error)
- pr_debug("PM: Image successfully loaded\n");
+ pr_debug("Image successfully loaded\n");
else
- pr_debug("PM: Error %d resuming\n", error);
+ pr_debug("Error %d resuming\n", error);
return error;
}
@@ -1553,13 +1538,13 @@ put:
if (error)
blkdev_put(hib_resume_bdev, FMODE_READ);
else
- pr_debug("PM: Image signature found, resuming\n");
+ pr_debug("Image signature found, resuming\n");
} else {
error = PTR_ERR(hib_resume_bdev);
}
if (error)
- pr_debug("PM: Image not found (code %d)\n", error);
+ pr_debug("Image not found (code %d)\n", error);
return error;
}
@@ -1571,7 +1556,7 @@ put:
void swsusp_close(fmode_t mode)
{
if (IS_ERR(hib_resume_bdev)) {
- pr_debug("PM: Image device not initialised\n");
+ pr_debug("Image device not initialised\n");
return;
}
@@ -1595,7 +1580,7 @@ int swsusp_unmark(void)
swsusp_resume_block,
swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
}
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 1896386e16bb..dfba59be190b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/wakelock.c
*
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 61d41ca41844..1d21ebacfdb8 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 749a6756843a..123154f86304 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PRINTK_BRAILLE_H
#define _PRINTK_BRAILLE_H
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 2ca4a8b5fe57..11f19c466af5 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CONSOLE_CMDLINE_H
#define _CONSOLE_CMDLINE_H
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fc47863f629c..512f7c2baedd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -649,7 +649,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, int source)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
-EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
@@ -1435,7 +1434,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = check_syslog_permissions(type, source);
if (error)
- goto out;
+ return error;
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
@@ -1443,20 +1442,16 @@ int do_syslog(int type, char __user *buf, int len, int source)
case SYSLOG_ACTION_OPEN: /* Open log */
break;
case SYSLOG_ACTION_READ: /* Read from log */
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = wait_event_interruptible(log_wait,
syslog_seq != log_next_seq);
if (error)
- goto out;
+ return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
@@ -1465,16 +1460,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* FALL THRU */
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
@@ -1496,15 +1487,13 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Set level of messages printed to console */
case SYSLOG_ACTION_CONSOLE_LEVEL:
- error = -EINVAL;
if (len < 1 || len > 8)
- goto out;
+ return -EINVAL;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = LOGLEVEL_DEFAULT;
- error = 0;
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
@@ -1526,7 +1515,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
u64 seq = syslog_seq;
u32 idx = syslog_idx;
- error = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
@@ -1546,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = -EINVAL;
break;
}
-out:
+
return error;
}
@@ -1698,10 +1686,10 @@ asmlinkage int vprintk_emit(int facility, int level,
{
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
- size_t text_len = 0;
+ size_t text_len;
enum log_flags lflags = 0;
unsigned long flags;
- int printed_len = 0;
+ int printed_len;
bool in_sched = false;
if (level == LOGLEVEL_SCHED) {
@@ -1754,7 +1742,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
+ printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len);
logbuf_unlock_irqrestore(flags);
@@ -2650,9 +2638,8 @@ void __init console_init(void)
* makes it difficult to diagnose problems that occur during this time.
*
* To mitigate this problem somewhat, only unregister consoles whose memory
- * intersects with the init section. Note that code exists elsewhere to get
- * rid of the boot console as soon as the proper console shows up, so there
- * won't be side-effects from postponing the removal.
+ * intersects with the init section. Note that all other boot consoles will
+ * get unregistred when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
@@ -2660,16 +2647,23 @@ static int __init printk_late_init(void)
int ret;
for_each_console(con) {
- if (!keep_bootcon && con->flags & CON_BOOT) {
+ if (!(con->flags & CON_BOOT))
+ continue;
+
+ /* Check addresses that might be used for enabled consoles. */
+ if (init_section_intersects(con, sizeof(*con)) ||
+ init_section_contains(con->write, 0) ||
+ init_section_contains(con->read, 0) ||
+ init_section_contains(con->device, 0) ||
+ init_section_contains(con->unblank, 0) ||
+ init_section_contains(con->data, 0)) {
/*
- * Make sure to unregister boot consoles whose data
- * resides in the init section before the init section
- * is discarded. Boot consoles whose data will stick
- * around will automatically be unregistered when the
- * proper console replaces them.
+ * Please, consider moving the reported consoles out
+ * of the init section.
*/
- if (init_section_intersects(con, sizeof(*con)))
- unregister_console(con);
+ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
+ con->name, con->index);
+ unregister_console(con);
}
}
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 60f356d91060..84b1367935e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- if (copy_siginfo_to_user32(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user32(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- if (copy_siginfo_to_user(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user(uinfo, &info)) {
ret = -EFAULT;
break;
}
diff --git a/kernel/range.c b/kernel/range.c
index 82cfc285b046..d84de6766472 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Range add and subtract
*/
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- bool
- default n
+ def_bool PREEMPT
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 13c0fc852767..020e8b6a644b 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Any varying coverage in these files is non-deterministic
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..59c471de342a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
extern int rcu_cpu_stall_suppress;
int rcu_jiffies_till_stall_check(void);
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+ if (!rcu_cpu_stall_suppress) \
+ rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+ if (rcu_cpu_stall_suppress == 3) \
+ rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
/*
@@ -220,8 +235,12 @@ do { \
static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
\
if (!atomic_read(&___rfd_beenhere) && \
- !atomic_xchg(&___rfd_beenhere, 1)) \
+ !atomic_xchg(&___rfd_beenhere, 1)) { \
+ tracing_off(); \
+ rcu_ftrace_dump_stall_suppress(); \
ftrace_dump(oops_dump_mode); \
+ rcu_ftrace_dump_stall_unsuppress(); \
+ } \
} while (0)
void rcu_early_boot_tests(void);
@@ -356,22 +375,10 @@ do { \
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +426,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = 0;
*completed = 0;
}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
@@ -460,92 +463,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
#endif
#ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..88cba7c2956c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -23,6 +23,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
#include "rcu_segcblist.h"
@@ -36,24 +37,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
- int cnt = 0;
- struct rcu_head **rhpp = &rclp->head;
-
- for (;;) {
- if (!*rhpp)
- return cnt;
- if (++cnt > lim)
- return -1;
- rhpp = &(*rhpp)->next;
- }
-}
-
-/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +86,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
- if (seg == RCU_DONE_TAIL)
- return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
- return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
@@ -134,50 +106,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
}
/*
- * Dequeue and return the first ready-to-invoke callback. If there
- * are no ready-to-invoke callbacks, return NULL. Disables interrupts
- * to avoid interference. Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
- int i;
- struct rcu_head *rhp;
-
- local_irq_save(flags);
- if (!rcu_segcblist_ready_cbs(rsclp)) {
- local_irq_restore(flags);
- return NULL;
- }
- rhp = rsclp->head;
- BUG_ON(!rhp);
- rsclp->head = rhp->next;
- for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
- if (rsclp->tails[i] != &rhp->next)
- break;
- rsclp->tails[i] = &rsclp->head;
- }
- smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
- WRITE_ONCE(rsclp->len, rsclp->len - 1);
- local_irq_restore(flags);
- return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rsclp->len_lazy--;
- local_irq_restore(flags);
-}
-
-/*
* Return a pointer to the first callback in the specified rcu_segcblist
* structure. This is useful for diagnostics.
*/
@@ -203,17 +131,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_is_enabled(rsclp) &&
- !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +420,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
return true;
return false;
}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+ rcu_segcblist_extract_count(src_rsclp, &donecbs);
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+ rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
rclp->len_lazy--;
}
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
- return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
- WARN_ON_ONCE(!rclp->head);
- return rclp->tail;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks perf testing.
*/
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
.name = "tasks"
};
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* If performance tests complete, wait for shutdown to commence.
*/
@@ -658,7 +643,7 @@ rcu_perf_init(void)
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
- RCUPERF_TASKS_OPS
+ &tasks_ops,
};
if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..74f6b0146b98 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
@@ -89,6 +90,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -199,7 +201,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +499,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
- .name = "rcu_busted"
+ .name = "busted"
};
/*
@@ -522,7 +525,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task())
schedule_timeout_interruptible(longdelay);
else
rcu_read_delay(rrsp);
@@ -561,44 +564,7 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int __maybe_unused cpu;
- int idx;
-
-#ifdef CONFIG_TREE_SRCU
- idx = srcu_ctlp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
- u0 = counts->srcu_unlock_count[!idx];
- u1 = counts->srcu_unlock_count[idx];
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = counts->srcu_lock_count[!idx];
- l1 = counts->srcu_lock_count[idx];
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
- idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
- torture_type, TORTURE_FLAG, idx,
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +586,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcu"
};
@@ -652,6 +619,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcud"
};
@@ -696,8 +664,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks torture testing.
*/
@@ -735,24 +701,11 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,13 +1067,18 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
* counter in the element should never be greater than 1, otherwise, the
* RCU implementation is broken.
*/
-static void rcu_torture_timer(unsigned long unused)
+static void rcu_torture_timer(struct timer_list *unused)
{
int idx;
unsigned long started;
@@ -1176,6 +1134,14 @@ static void rcu_torture_timer(unsigned long unused)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1199,7 +1165,7 @@ rcu_torture_reader(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
- setup_timer_on_stack(&t, rcu_torture_timer, 0);
+ timer_setup_on_stack(&t, rcu_torture_timer, 0);
do {
if (irqreader && cur_ops->irq_capable) {
@@ -1275,6 +1241,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
@@ -1354,11 +1321,16 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags,
- wtp == NULL ? ~0UL : wtp->state);
+ wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
+ if (!splatted && wtp) {
+ sched_show_task(wtp);
+ splatted = true;
+ }
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1392,7 +1364,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
- "stall_cpu=%d stall_cpu_holdoff=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
@@ -1400,7 +1372,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
- stall_cpu, stall_cpu_holdoff,
+ stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
n_barrier_cbs,
onoff_interval, onoff_holdoff);
}
@@ -1465,12 +1437,19 @@ static int rcu_torture_stall(void *args)
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- pr_alert("rcu_torture_stall start.\n");
rcu_read_lock();
- preempt_disable();
+ if (stall_cpu_irqsoff)
+ local_irq_disable();
+ else
+ preempt_disable();
+ pr_alert("rcu_torture_stall start on CPU %d.\n",
+ smp_processor_id());
while (ULONG_CMP_LT(get_seconds(), stop_at))
continue; /* Induce RCU CPU stall warning. */
- preempt_enable();
+ if (stall_cpu_irqsoff)
+ local_irq_enable();
+ else
+ preempt_enable();
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
@@ -1749,7 +1728,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &sched_ops, RCUTORTURE_TASKS_OPS
+ &sched_ops, &tasks_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+int rcu_scheduler_active __read_mostly;
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
destroy_rcu_head_on_stack(&rs.head);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
/*
* Initialize SRCU combining tree. Note that statically allocated
@@ -853,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
* @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
* @func: function to be invoked after the SRCU grace period
*
* The callback function will be invoked some time after a full SRCU
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
}
/**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
srcu_advance_state(sp);
srcu_reschedule(sp, srcu_get_delay(sp));
}
-EXPORT_SYMBOL_GPL(process_srcu);
void srcutorture_get_gp_data(enum rcutorture_type test_type,
struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+
+ idx = sp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *counts;
+
+ counts = per_cpu_ptr(sp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
static int __init srcu_bootup_announce(void)
{
pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
* Must be called after rcu_sync_init() and before first use.
*
* Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
* This function is passed to one of the call_rcu() functions by
* rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* rcu_sync_exit(). Otherwise, set all state back to idle so that readers
* can again use their fastpaths.
*/
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
{
- struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+ struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
-#include "tiny_plugin.h"
-
void rcu_barrier_bh(void)
{
wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously. Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process. Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
- ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3acf32d..f9c0ca2ccf0c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
.gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
- .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -537,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
* How long the grace period must be before we start recruiting
* quiescent-state help from rcu_note_context_switch().
*/
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
+static ulong jiffies_till_sched_qs = HZ / 10;
+module_param(jiffies_till_sched_qs, ulong, 0444);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
@@ -737,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
return READ_ONCE(*fp);
}
@@ -749,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -776,7 +773,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -840,16 +837,15 @@ static void rcu_eqs_enter(bool user)
* We crowbar the ->dynticks_nesting field to zero to allow for
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
- local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -859,10 +855,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
* is permitted between this call and rcu_user_exit(). This way the
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_enter(void)
{
- rcu_eqs_enter(1);
+ lockdep_assert_irqs_disabled();
+ rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -881,13 +881,21 @@ void rcu_user_enter(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit(void)
{
struct rcu_dynticks *rdtp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
rdtp->dynticks_nesting < 1);
if (rdtp->dynticks_nesting <= 1) {
@@ -900,6 +908,9 @@ void rcu_irq_exit(void)
/*
* Wrapper for rcu_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit_irqson(void)
{
@@ -948,15 +959,17 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
+ __this_cpu_inc(disable_rcu_irq_enter);
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_eqs_exit_common(oldval, user);
+ __this_cpu_dec(disable_rcu_irq_enter);
}
}
@@ -970,6 +983,9 @@ static void rcu_eqs_exit(bool user)
* allow for the possibility of usermode upcalls messing up our count
* of interrupt nesting level during the busy period that is just
* now starting.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_exit(void)
{
@@ -979,7 +995,6 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -987,6 +1002,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_exit(void)
{
@@ -1012,14 +1030,22 @@ void rcu_user_exit(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -1032,6 +1058,9 @@ void rcu_irq_enter(void)
/*
* Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter_irqson(void)
{
@@ -1050,6 +1079,9 @@ void rcu_irq_enter_irqson(void)
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_enter(void)
{
@@ -1082,6 +1114,9 @@ void rcu_nmi_enter(void)
* RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_exit(void)
{
@@ -1202,6 +1237,22 @@ static int rcu_is_cpu_rrupt_from_idle(void)
}
/*
+ * We are reporting a quiescent state on behalf of some other CPU, so
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gpnum counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
+ */
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ lockdep_assert_held(&rnp->lock);
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
+ WRITE_ONCE(rdp->gpwrap, true);
+ if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
+ rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4;
+}
+
+/*
* Snapshot the specified CPU's dynticks counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
@@ -1211,15 +1262,34 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
- rdp->mynode->gpnum))
- WRITE_ONCE(rdp->gpwrap, true);
+ rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
}
return 0;
}
/*
+ * Handler for the irq_work request posted when a grace period has
+ * gone on for too long, but not yet long enough for an RCU CPU
+ * stall warning. Set state appropriately, but just complain if
+ * there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1230,8 +1300,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
unsigned long jtsq;
bool *rnhqp;
bool *ruqp;
- unsigned long rjtsc;
- struct rcu_node *rnp;
+ struct rcu_node *rnp = rdp->mynode;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1244,34 +1313,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
- /* Compute and saturate jiffies_till_sched_qs. */
- jtsq = jiffies_till_sched_qs;
- rjtsc = rcu_jiffies_till_stall_check();
- if (jtsq > rjtsc / 2) {
- WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
- jtsq = rjtsc / 2;
- } else if (jtsq < 1) {
- WRITE_ONCE(jiffies_till_sched_qs, 1);
- jtsq = 1;
- }
-
/*
* Has this CPU encountered a cond_resched_rcu_qs() since the
* beginning of the grace period? For this to be the case,
* the CPU has to have noticed the current grace period. This
* might not be the case for nohz_full CPUs looping in the kernel.
*/
- rnp = rdp->mynode;
+ jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
- } else {
+ } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
smp_store_release(ruqp, true);
}
@@ -1280,6 +1340,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
@@ -1299,10 +1360,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* updates are only once every few jiffies, the probability of
* lossage (and thus of slight grace-period extension) is
* quite low.
- *
- * Note that if the jiffies_till_sched_qs boot/sysfs parameter
- * is set too high, we override with half of the RCU CPU stall
- * warning delay.
*/
rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
@@ -1311,15 +1368,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
}
/*
- * If more than halfway to RCU CPU stall-warning time, do
- * a resched_cpu() to try to loosen things up a bit.
+ * If more than halfway to RCU CPU stall-warning time, do a
+ * resched_cpu() to try to loosen things up a bit. Also check to
+ * see if the CPU is getting hammered with interrupts, but only
+ * once per grace period, just to keep the IPIs down to a dull roar.
*/
- if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) {
resched_cpu(rdp->cpu);
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum &&
+ (rnp->ffmask & rdp->grpmask)) {
+ init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ rdp->rcu_iw_pending = true;
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+ }
+ }
return 0;
}
@@ -1358,12 +1426,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+ rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
@@ -1507,6 +1576,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1522,7 +1592,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
*/
pr_err("INFO: %s self-detected stall on CPU", rsp->name);
print_cpu_stall_info_begin();
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(rsp, smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
@@ -1916,6 +1988,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
+ rcu_gpnum_ovf(rnp, rdp);
}
return ret;
}
@@ -2067,8 +2140,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
{
@@ -2206,9 +2279,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_interruptible(rsp->gp_wq,
- READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
@@ -2239,7 +2311,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2409,6 +2481,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2637,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
-{
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
- rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- */
- rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
- /* Finally, disallow further callbacks on this CPU. */
- rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs are handled specially. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->n_cbs_adopted += rsp->orphan_done.len;
- if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
- rcu_idle_count_callbacks_posted();
- rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks, then the done ones. */
- rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
- WARN_ON_ONCE(rsp->orphan_done.head);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
- WARN_ON_ONCE(rsp->orphan_pend.head);
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
- !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
* Trace the fact that this CPU is going offline.
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2699,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2720,18 +2713,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
- !rcu_segcblist_empty(&rdp->cblist),
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
- cpu, rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -3183,9 +3164,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
* read-side critical sections have completed. call_rcu_sched() assumes
* that the read-side critical sections end on enabling of preemption
* or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
*
* These may be nested.
*
@@ -3210,11 +3192,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
- * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *
+ * These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -3569,10 +3552,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+ rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
}
}
@@ -3584,14 +3568,15 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rsp->barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+ rsp->barrier_sequence);
}
}
@@ -3605,14 +3590,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, s);
+ _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
/* Did someone else do our work for us? */
if (rcu_seq_done(&rsp->barrier_sequence, s)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+ rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
@@ -3620,7 +3606,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Mark the start of the barrier operation. */
rcu_seq_start(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3643,10 +3629,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
rsp->barrier_sequence);
} else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3640,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
rcu_barrier_callback, rsp, cpu, 0);
}
} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
rsp->barrier_sequence);
}
}
@@ -3675,7 +3661,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
wait_for_completion(&rsp->barrier_completion);
/* Mark the end of the barrier operation. */
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
rcu_seq_end(&rsp->barrier_sequence);
/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,14 +3763,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- if (!rdp->beenonline)
- WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
+ rdp->rcu_iw_pending = false;
+ rdp->rcu_iw_gpnum = rnp->gpnum - 1;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -3822,10 +3808,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
*/
int rcutree_online_cpu(unsigned int cpu)
{
- sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_online_cpu(cpu);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return 0; /* Too early in boot for scheduler work. */
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
return 0;
}
@@ -3835,6 +3835,19 @@ int rcutree_online_cpu(unsigned int cpu)
*/
int rcutree_offline_cpu(unsigned int cpu)
{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
rcutree_affinity_setting(cpu, cpu);
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_offline_cpu(cpu);
@@ -3882,6 +3895,8 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
+ int nbits;
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
@@ -3892,9 +3907,15 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->qsmaskinitnext |= mask;
+ oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
+ oldmask ^= rnp->expmaskinitnext;
+ nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3958,50 @@ void rcu_report_dead(unsigned int cpu)
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ return; /* No callbacks to migrate. */
+
+ local_irq_save(flags);
+ my_rdp = this_cpu_ptr(rsp->rda);
+ if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+ local_irq_restore(flags);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation. We need to migrate the outgoing CPU's callbacks.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_migrate_callbacks(cpu, rsp);
+}
#endif
/*
@@ -4134,7 +4199,7 @@ static void __init rcu_init_geometry(void)
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
- pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
@@ -4231,8 +4296,7 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
+ rcutree_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..46a5d1991450 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -103,6 +103,7 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -219,8 +220,6 @@ struct rcu_data {
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -268,7 +267,9 @@ struct rcu_data {
struct rcu_head **nocb_follower_tail;
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -285,6 +286,10 @@ struct rcu_data {
/* 8) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
+ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+ struct irq_work rcu_iw; /* Check for non-irq activity. */
+ bool rcu_iw_pending; /* Is ->rcu_iw pending? */
+ unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */
int cpu;
struct rcu_state *rsp;
@@ -350,15 +355,6 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_cblist orphan_done; /* Orphaned callbacks that */
- /* are ready to invoke. */
- /* (Contains counts.) */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +491,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
+ int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
struct rcu_node *rnp;
struct rcu_node *rnp_up;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..db85ca3975f1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
@@ -54,6 +55,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
* This probably needs to be excluded from -rt builds.
*/
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
@@ -89,7 +91,7 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
- pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
@@ -180,6 +182,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
@@ -261,6 +265,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
@@ -319,7 +327,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
struct rcu_data *rdp;
struct rcu_node *rnp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -482,6 +490,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +504,10 @@ void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
}
/*
@@ -523,7 +532,7 @@ void rcu_read_unlock_special(struct task_struct *t)
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_unlock(&rnp->boost_mtx);
+ rt_mutex_futex_unlock(&rnp->boost_mtx);
/*
* If this was the last task on the expedited lists,
@@ -636,10 +645,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ struct task_struct *t;
+
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gpnum, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
@@ -897,8 +913,6 @@ void exit_rcu(void)
#ifdef CONFIG_RCU_BOOST
-#include "../locking/rtmutex_common.h"
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -1407,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1456,7 +1470,7 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
@@ -1493,7 +1507,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (rcu_segcblist_pend_cbs(&rdp->cblist))
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1511,7 +1525,7 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
@@ -1657,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
*/
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
+ unsigned long delta;
char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1671,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
ticks_value, ticks_title,
rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -1788,23 +1807,62 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * and this function releases it.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!READ_ONCE(rdp_leader->nocb_kthread))
+ lockdep_assert_held(&rdp->nocb_lock);
+ if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
- if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ }
+ if (rdp_leader->nocb_leader_sleep || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ del_timer(&rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
+ } else {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
}
/*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ __wake_nocb_leader(rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
@@ -1891,11 +1949,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1960,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
@@ -1961,30 +2013,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->orphan_done.len;
- long qll = rsp->orphan_done.len_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ lockdep_assert_irqs_disabled();
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_done.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
- rcu_cblist_tail(&rsp->orphan_done),
- ql, qll, flags);
- }
- if (rsp->orphan_pend.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
- rcu_cblist_tail(&rsp->orphan_pend),
- ql, qll, flags);
- }
- rcu_cblist_init(&rsp->orphan_done);
- rcu_cblist_init(&rsp->orphan_pend);
+ return false; /* Not NOCBs CPU, caller must migrate CBs. */
+ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+ rcu_segcblist_tail(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2031,6 +2072,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
+ unsigned long flags;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
@@ -2039,13 +2081,17 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+ my_rdp->nocb_leader_sleep = true;
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
}
/*
@@ -2054,7 +2100,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
- smp_mb(); /* wakeup before ->nocb_head reads. */
+ smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2066,56 +2112,41 @@ wait_again:
gotcbs = true;
}
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
+ /* No callbacks? Sleep a bit if polling, and go retry. */
if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (READ_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
+ if (rcu_nocb_poll) {
+ schedule_timeout_interruptible(1);
+ } else {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ }
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (READ_ONCE(rdp->nocb_head))
+ if (!rcu_nocb_poll &&
+ READ_ONCE(rdp->nocb_head) &&
+ READ_ONCE(my_rdp->nocb_leader_sleep)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+ }
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = rdp->nocb_gp_tail;
*tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
+ /* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
}
}
@@ -2131,28 +2162,16 @@ wait_again:
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
- bool firsttime = true;
-
for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- swait_event_interruptible(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+ swait_event_interruptible(rdp->nocb_wq,
+ READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
}
}
@@ -2165,6 +2184,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
+ unsigned long flags;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2179,11 +2199,14 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = READ_ONCE(rdp->nocb_follower_head);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ list = rdp->nocb_follower_head;
+ rdp->nocb_follower_head = NULL;
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- WRITE_ONCE(rdp->nocb_follower_head, NULL);
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2249,41 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
+ unsigned long flags;
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
+ }
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+ __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2287,6 +2333,8 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_tail = &rdp->nocb_head;
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_lock_init(&rdp->nocb_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
}
/*
@@ -2459,7 +2507,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
@@ -2541,7 +2589,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
@@ -494,6 +495,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
@@ -568,14 +570,13 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
-static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
/**
@@ -600,7 +601,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
- bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -610,11 +610,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* We can't create the thread unless interrupts are enabled. */
- if ((needwake && havetask) ||
- (!havetask && !irqs_disabled_flags(flags))) {
- rcu_spawn_tasks_kthread();
+ if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
wake_up(&rcu_tasks_cbs_wq);
- }
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
@@ -718,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
@@ -853,26 +850,33 @@ static int __noreturn rcu_tasks_kthread(void *arg)
}
}
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
+/* Spawn rcu_tasks_kthread() at core_initcall() time. */
+static int __init rcu_spawn_tasks_kthread(void)
{
- static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
struct task_struct *t;
- if (READ_ONCE(rcu_tasks_kthread_ptr)) {
- smp_mb(); /* Ensure caller sees full kthread. */
- return;
- }
- mutex_lock(&rcu_tasks_kthread_mutex);
- if (rcu_tasks_kthread_ptr) {
- mutex_unlock(&rcu_tasks_kthread_mutex);
- return;
- }
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
- mutex_unlock(&rcu_tasks_kthread_mutex);
+ return 0;
+}
+core_initcall(rcu_spawn_tasks_kthread);
+
+/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_start(void)
+{
+ preempt_disable();
+ current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ preempt_enable();
+}
+
+/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_finish(void)
+{
+ preempt_disable();
+ __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+ preempt_enable();
}
#endif /* #ifdef CONFIG_TASKS_RCU */
diff --git a/kernel/resource.c b/kernel/resource.c
index 9b5f04404152..54ba6de3757c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -397,9 +397,32 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
+ res->flags = p->flags;
+ res->desc = p->desc;
return 0;
}
+static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
+ bool first_level_children_only,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ u64 orig_end = res->end;
+ int ret = -1;
+
+ while ((res->start < res->end) &&
+ !find_next_iomem_res(res, desc, first_level_children_only)) {
+ ret = (*func)(res, arg);
+ if (ret)
+ break;
+
+ res->start = res->end + 1;
+ res->end = orig_end;
+ }
+
+ return ret;
+}
+
/*
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
@@ -415,29 +438,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
- u64 end, void *arg, int (*func)(u64, u64, void *))
+ u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = flags;
- orig_end = res.end;
-
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, desc, false))) {
-
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
-
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+ return __walk_iomem_res_desc(&res, desc, false, arg, func);
}
/*
@@ -448,25 +457,33 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
+}
+
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
}
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
@@ -508,6 +525,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
}
+
/*
* This generic page_is_ram() returns true if specified address is
* registered as System RAM in iomem_resource list.
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
@@ -25,3 +26,5 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489d2d80..a43df5193538 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
#include <linux/proc_fs.h>
@@ -71,7 +72,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -101,7 +101,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index ce40c810cd5c..27cd22b89824 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP
#include <linux/kref.h>
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..e086babe6c61 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -388,7 +388,7 @@ void sched_clock_tick(void)
if (unlikely(!sched_clock_running))
return;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
scd = this_scd();
__scd_stamp(scd);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..2ddaec40956f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic wait-for-completion handler;
*
@@ -32,6 +33,12 @@ void complete(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
+
+ /*
+ * Perform commit of crossrelease here.
+ */
+ complete_release_commit(x);
+
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -47,6 +54,13 @@ EXPORT_SYMBOL(complete);
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
@@ -92,9 +106,14 @@ __wait_for_common(struct completion *x,
{
might_sleep();
+ complete_acquire(x);
+
spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
+
+ complete_release(x);
+
return timeout;
}
@@ -297,9 +316,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -307,14 +329,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20fba81..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
*/
-
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
-
#undef SCHED_FEAT
+#endif
/*
* Number of tasks to iterate in a single balance run.
@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -505,8 +506,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -526,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -535,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -951,8 +959,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -1168,6 +1181,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -1967,8 +1984,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
- smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
if (!(p->state & state))
goto out;
@@ -2349,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -2635,6 +2652,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC end
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out if a smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -3281,8 +3308,8 @@ static void __sched notrace __schedule(bool preempt)
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
- smp_mb__before_spinlock();
rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
@@ -3324,6 +3351,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3394,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
@@ -3771,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -3928,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -4808,6 +4850,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5103,24 +5146,17 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5138,6 +5174,29 @@ void sched_show_task(struct task_struct *p)
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
+
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
void show_state_filter(unsigned long state_filter)
{
@@ -5161,7 +5220,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
@@ -5177,11 +5236,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5438,7 +5492,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
@@ -5538,16 +5592,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
@@ -5683,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -5696,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@@ -5890,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@@ -5909,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f95ab29a45d0..44ab32a4fab6 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/percpu.h>
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ba72807c73d4..a8358a57a316 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_CGROUP_CPUACCT
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c7d026..8d9562d890d3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
- return best_cpu;
+ return 1;
+ }
+ }
+ return 0;
}
/*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index f7da8c55bba0..b010d26e108e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUDL_H
#define _LINUX_CPUDL_H
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 29a397067ffa..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -52,9 +52,11 @@ struct sugov_policy {
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cpu;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy. */
@@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-cpu data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-cpu
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * For the slow switching platforms, the kthread is always scheduled on
+ * the right set of CPUs and any CPU can find the next frequency and
+ * schedule the kthread.
+ */
+ if (sg_policy->policy->fast_switch_enabled &&
+ !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ return false;
+
if (sg_policy->work_in_progress)
return false;
@@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (policy->fast_switch_enabled) {
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (next_freq == CPUFREQ_ENTRY_INVALID)
+ if (!next_freq)
return;
policy->cur = next_freq;
@@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
{
- struct rq *rq = this_rq();
+ struct rq *rq = cpu_rq(cpu);
unsigned long cfs_max;
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ cfs_max = arch_scale_cpu_capacity(NULL, cpu);
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
@@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;
/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned int boost_util, boost_max;
- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -229,15 +275,19 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq)
+ if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
+
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
}
sugov_update_commit(sg_policy, time, next_f);
}
@@ -264,6 +314,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
@@ -290,7 +341,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock);
@@ -445,7 +496,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+
+ /* Kthread is bound to all CPUs by default */
+ if (!policy->dvfs_possible_from_any_cpu)
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -528,16 +583,7 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- if (policy->transition_delay_us) {
- tunables->rate_limit_us = policy->transition_delay_us;
- } else {
- unsigned int lat;
-
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
- }
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -607,6 +653,7 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
@@ -655,6 +702,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
+ .dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..2511aba36b89 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 63cbb9ca0496..bab050019071 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..9be8b68a66da 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max)
{
u64 accounted;
- /* Shall be converted to a lockdep-enabled lightweight check */
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
accounted = steal_account_process_time(max);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 755bd3f1a1a9..f349f7e98dec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Deadline Scheduling Class (SCHED_DEADLINE)
*
@@ -242,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
@@ -296,7 +297,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->rb_leftmost == &dl_se->rb_node;
+ return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
@@ -320,7 +321,7 @@ void init_dl_bw(struct dl_bw *dl_b)
void init_dl_rq(struct dl_rq *dl_rq)
{
- dl_rq->rb_root = RB_ROOT;
+ dl_rq->root = RB_ROOT_CACHED;
#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
@@ -328,7 +329,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
- dl_rq->pushable_dl_tasks_root = RB_ROOT;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -410,10 +411,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct task_struct *entry;
- int leftmost = 1;
+ bool leftmost = true;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
@@ -425,17 +426,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
link = &parent->rb_left;
else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost) {
- dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ if (leftmost)
dl_rq->earliest_dl.next = p->dl.deadline;
- }
rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_insert_color_cached(&p->pushable_dl_tasks,
+ &dl_rq->pushable_dl_tasks_root, leftmost);
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -445,24 +445,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
struct rb_node *next_node;
next_node = rb_next(&p->pushable_dl_tasks);
- dl_rq->pushable_dl_tasks_leftmost = next_node;
if (next_node) {
dl_rq->earliest_dl.next = rb_entry(next_node,
struct task_struct, pushable_dl_tasks)->dl.deadline;
}
}
- rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
static inline int has_pushable_dl_tasks(struct rq *rq)
{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
static int push_dl_task(struct rq *rq);
@@ -1136,7 +1135,7 @@ static void update_curr_dl(struct rq *rq)
}
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1211,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
@@ -1266,7 +1265,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
- struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
@@ -1313,7 +1312,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node **link = &dl_rq->root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_dl_entity *entry;
int leftmost = 1;
@@ -1331,11 +1330,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
}
}
- if (leftmost)
- dl_rq->rb_leftmost = &dl_se->rb_node;
-
rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1347,14 +1343,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
- if (dl_rq->rb_leftmost == &dl_se->rb_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&dl_se->rb_node);
- dl_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1376,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+ rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
@@ -1594,7 +1587,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return;
/*
@@ -1602,7 +1595,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
@@ -1647,7 +1640,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
- struct rb_node *left = dl_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
@@ -1655,7 +1648,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *
+static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
@@ -1771,7 +1764,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
if (!has_pushable_dl_tasks(rq))
@@ -1798,7 +1791,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1811,17 +1804,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
*
* Now we check how well this matches with task's
* affinity and system topology.
@@ -1841,6 +1831,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1852,12 +1843,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+ * Last chance: if a cpu being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible cpu is
+ * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
@@ -1944,7 +1938,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
struct task_struct, pushable_dl_tasks);
BUG_ON(rq->cpu != task_cpu(p));
@@ -2177,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -2266,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- /*
- * If p is boosted we already updated its params in
- * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
- * p's deadline being now already after rq_clock(rq).
- */
- if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
@@ -2462,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2474,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
+__read_mostly bool sched_debug_enabled;
+
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
+ debugfs_create_bool("sched_debug", 0644, NULL,
+ &sched_debug_enabled);
+
return 0;
}
late_initcall(sched_init_debug);
@@ -327,38 +332,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
+static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
+
void register_sched_domain_sysctl(void)
{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
char buf[32];
+ int i;
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
- if (entry == NULL)
- return;
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
@@ -396,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
+ P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
+ P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
@@ -425,9 +472,9 @@ static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
if (rq->curr == p)
- SEQ_printf(m, "R");
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
@@ -456,9 +503,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
- " task PID tree-key switches prio"
+ " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
+ "-------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -488,7 +535,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
+ if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
@@ -513,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->runnable_load_avg);
+ cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
- atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- atomic_long_read(&cfs_rq->removed_util_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
+ cfs_rq->removed.load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
+ cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -872,11 +922,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
@@ -958,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+ P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
+ P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
+ P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..0989676c50e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
@@ -32,6 +33,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -513,6 +515,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +526,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +549,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +568,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +605,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -729,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- sa->last_update_time = 0;
- /*
- * sched_avg's period_contrib should be strictly less then 1024, so
- * we give it 1023 to make sure it is almost a period (1024us), and
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
+ memset(sa, 0, sizeof(*sa));
+
/*
* Tasks are intialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
@@ -743,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- /*
- * At this point, util_avg won't be used in select_task_rq_fair anyway
- */
- sa->util_avg = 0;
- sa->util_sum = 0;
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+ se->runnable_weight = se->load.weight;
+
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -797,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
if (entity_is_task(se)) {
@@ -806,7 +786,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1051,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1110,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1166,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1215,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1419,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1450,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1849,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1933,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1963,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -1966,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.load_sum / p->se.load.weight;
+ delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
@@ -2448,7 +2500,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2628,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2638,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2651,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2694,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight += se->runnable_weight;
+
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight -= se->runnable_weight;
+
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
+ se_runnable(se) * se->avg.runnable_load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight, unsigned long runnable)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ dequeue_runnable_load_avg(cfs_rq, se);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+ se->runnable_weight = runnable;
+ update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+ do {
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+ } while (0);
+#endif
+
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ account_entity_enqueue(cfs_rq, se);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ }
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &se->load;
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+ reweight_entity(cfs_rq, se, weight, weight);
+ load->inv_weight = sched_prio_to_wmult[prio];
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (1)
+ * \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ * grq->load.weight -> grq->avg.load_avg (2)
+ *
+ * which yields the following:
+ *
+ * tg->weight * grq->avg.load_avg
+ * ge->load.weight = ------------------------------ (3)
+ * tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- = tg->weight (4)
+ * grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ * ge->load.weight =
+ *
+ * tg->weight * grq->load.weight
+ * --------------------------------------------------- (5)
+ * tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (6)
+ * tg_load_avg'
+ *
+ * Where:
+ *
+ * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ * max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ * \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;
- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
tg_weight = atomic_long_read(&tg->load_avg);
@@ -2713,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
- shares = (tg->shares * load);
+ shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
@@ -2729,67 +2928,115 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- return tg->shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-# endif /* CONFIG_SMP */
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ * grq->avg.runnable_load_avg
+ * ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ * grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ * ge->runnable_weight = ge->load.weight *
+ *
+ * max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ * ----------------------------------------------------- (8)
+ * max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
- if (se->on_rq) {
- /* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
- }
+ long runnable, load_avg;
- update_load_set(&se->load, weight);
+ load_avg = max(cfs_rq->avg.load_avg,
+ scale_load_down(cfs_rq->load.weight));
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ runnable = max(cfs_rq->avg.runnable_load_avg,
+ scale_load_down(cfs_rq->runnable_weight));
+
+ runnable *= shares;
+ if (load_avg)
+ runnable /= load_avg;
+
+ return clamp_t(long, runnable, MIN_SHARES, shares);
}
+# endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
- long shares;
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares, runnable;
- if (!cfs_rq)
+ if (!gcfs_rq)
return;
- if (throttled_hierarchy(cfs_rq))
+ if (throttled_hierarchy(gcfs_rq))
return;
- tg = cfs_rq->tg;
-
#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+
+ if (likely(se->load.weight == shares))
return;
+#else
+ shares = calc_group_shares(gcfs_rq);
+ runnable = calc_group_runnable(gcfs_rq, shares);
#endif
- shares = calc_cfs_shares(cfs_rq, tg);
- reweight_entity(cfs_rq_of(se), se, shares);
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2869,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2886,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods);
- }
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
@@ -2902,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
@@ -2942,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -2968,69 +3212,114 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!load)
+ runnable = running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- }
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
-
- return 1;
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ sa->util_avg = sa->util_sum / divider;
}
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfq_rs:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runable_load_avg
+ */
+
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
-}
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- \
- res = var + val; \
- \
- if (val < 0 && res > var) \
- res = 0; \
- \
- WRITE_ONCE(*ptr, res); \
-} while (0)
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
@@ -3113,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ * grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ * d(runnable_avg) += se->avg.load_avg / se->load.weight
+ * (5)
+ * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/* Nothing to update */
@@ -3133,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}
-/* Take into account change of load of a child task group */
static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
+ long runnable_load_avg, load_avg;
+ s64 runnable_load_sum, load_sum;
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
+ if (!runnable_sum)
+ return;
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+ gcfs_rq->prop_runnable_sum = 0;
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
-
- delta = load - se->avg.load_avg;
+ add_positive(&se->avg.load_sum, runnable_sum);
+ add_positive(&se->avg.load_avg, load_avg);
- /* Nothing to update */
- if (!delta)
- return;
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
- /* Set new sched_entity's load */
- se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- /* Update parent cfs_rq load */
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
}
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
- if (!cfs_rq->propagate_avg)
- return 0;
-
- cfs_rq->propagate_avg = 0;
- return 1;
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
- if (!test_and_clear_tg_cfs_propagate(se))
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
return 0;
+ gcfs_rq->propagate = 0;
+
cfs_rq = cfs_rq_of(se);
- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
- update_tg_cfs_util(cfs_rq, se);
- update_tg_cfs_load(cfs_rq, se);
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
return 1;
}
@@ -3252,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
- if (gcfs_rq->propagate_avg)
+ if (gcfs_rq->propagate)
return false;
/*
@@ -3272,55 +3590,14 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 0;
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,67 +3611,47 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed_load = 0, removed_util = 0;
+ int decayed = 0;
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ if (cfs_rq->removed.nr) {
+ unsigned long r;
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ raw_spin_lock(&cfs_rq->removed.lock);
+ swap(cfs_rq->removed.util_avg, removed_util);
+ swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ cfs_rq->removed.nr = 0;
+ raw_spin_unlock(&cfs_rq->removed.lock);
+
+ r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed_load = 1;
- set_tg_cfs_propagate(cfs_rq);
- }
+ sub_positive(&sa->load_sum, r * divider);
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ r = removed_util;
sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- removed_util = 1;
- set_tg_cfs_propagate(cfs_rq);
+ sub_positive(&sa->util_sum, r * divider);
+
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+
+ decayed = 1;
}
- decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed)
cfs_rq_util_change(cfs_rq);
- return decayed || removed_load;
-}
-
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG 0x1
-#define SKIP_AGE_LOAD 0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
- int decayed;
-
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
- decayed |= propagate_entity_load_avg(se);
-
- if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ return decayed;
}
/**
@@ -3407,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+ /*
+ * When we attach the @se to the @cfs_rq, we must align the decay
+ * window because without that, really weird and wonderful things can
+ * happen.
+ *
+ * XXX illustrate
+ */
se->avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+ /*
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ * period_contrib. This isn't strictly correct, but since we're
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ * _sum a little.
+ */
+ se->avg.util_sum = se->avg.util_avg * divider;
+
+ se->avg.load_sum = divider;
+ if (se_weight(se)) {
+ se->avg.load_sum =
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+ }
+
+ se->avg.runnable_load_sum = se->avg.load_sum;
+
+ enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3427,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+#define DO_ATTACH 0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+ int decayed;
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
- }
-}
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+ } else if (decayed && (flags & UPDATE_TG))
+ update_tg_load_avg(cfs_rq, 0);
}
#ifndef CONFIG_64BIT
@@ -3503,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ unsigned long flags;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3515,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
*/
sync_entity_load_avg(se);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ ++cfs_rq->removed.nr;
+ cfs_rq->removed.util_avg += se->avg.util_avg;
+ cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->runnable_load_avg;
+ return cfs_rq->avg.runnable_load_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3534,23 +3832,20 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
+#define DO_ATTACH 0x0
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq);
}
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
@@ -3695,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
- update_load_avg(se, UPDATE_TG);
- enqueue_entity_load_avg(cfs_rq, se);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ update_cfs_group(se);
+ enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
@@ -3779,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(se, UPDATE_TG);
- dequeue_entity_load_avg(cfs_rq, se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ dequeue_runnable_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3803,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(se);
+ update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3867,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3969,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_load_avg(prev, 0);
+ update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3985,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(curr);
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4875,7 +5170,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -4903,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -4962,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -5125,9 +5420,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5467,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5488,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5504,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5530,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5548,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5563,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5585,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5640,77 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
+
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ if (idle_cpu(this_cpu))
+ return true;
+
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
+
+ return false;
+}
+
+static bool
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (current_load > this_eff_load)
+ return true;
+
+ this_eff_load -= current_load;
+ }
+
+ task_load = task_h_load(p);
+
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
bool affine = false;
- /*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
- */
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+
+ if (sched_feat(WA_WEIGHT) && !affine)
+ affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -5380,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5387,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@ -5509,10 +5864,10 @@ skip_spare:
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5550,7 +5905,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -5561,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5913,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
- pick_cpu:
+pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
}
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -5990,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
se->vruntime -= min_vruntime;
}
- /*
- * We are supposed to update the task to "current" time, then its up to date
- * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
- * what current time is, so simply throw away the out-of-date time. This
- * will result in the wakee task is less decayed, but giving the wakee more
- * load sounds not bad.
- */
- remove_entity_load_avg(&p->se);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+ /*
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+ * rq->lock and can modify state directly.
+ */
+ lockdep_assert_held(&task_rq(p)->lock);
+ detach_entity_cfs_rq(&p->se);
+
+ } else {
+ /*
+ * We are supposed to update the task to "current" time, then
+ * its up to date and ready to go to new CPU/cfs_rq. But we
+ * have difficulty in getting what current time is, so simply
+ * throw away the out-of-date time. This will result in the
+ * wakee task is less decayed, but giving the wakee more load
+ * sounds not bad.
+ */
+ remove_entity_load_avg(&p->se);
+ }
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
@@ -6187,10 +6580,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6613,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6259,17 +6658,10 @@ again:
set_next_entity(cfs_rq, se);
}
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
-
- return p;
+ goto done;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6280,6 +6672,16 @@ simple:
p = task_of(se);
+done: __maybe_unused
+#ifdef CONFIG_SMP
+ /*
+ * Move the next running task to the front of
+ * the list, so our cfs_tasks list becomes MRU
+ * one.
+ */
+ list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
@@ -6715,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
lockdep_assert_held(&env->src_rq->lock);
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ list_for_each_entry_reverse(p,
+ &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
@@ -6765,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_first_entry(tasks, struct task_struct, se.group_node);
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
@@ -6815,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
- list_move_tail(&p->se.group_node, tasks);
+ list_move(&p->se.group_node, tasks);
}
/*
@@ -6891,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
- if (cfs_rq->runnable_load_sum)
+ if (cfs_rq->avg.runnable_load_sum)
return false;
return true;
@@ -6917,13 +7320,13 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq_of(se), se, 0);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -6990,7 +7393,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7439,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7459,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7546,6 +7951,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7560,7 +7966,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
}
/**
@@ -7581,7 +7986,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
@@ -7790,6 +8195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7801,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
@@ -7892,7 +8301,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
@@ -7970,6 +8379,13 @@ static int should_we_balance(struct lb_env *env)
int cpu, balance_cpu = -1;
/*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
@@ -8309,6 +8725,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8415,6 +8837,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8599,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!is_housekeeping_cpu(cpu))
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@ -9064,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
}
#else
@@ -9076,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9095,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
#endif
/* Synchronize entity with its cfs_rq */
- update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9171,17 +9600,13 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->propagate_avg = 0;
-#endif
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
+ raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
@@ -9379,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..9552fd5854bf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Only give sleepers 50% of their service deficit. This allows
* them to run sooner, but does not allow tons of sleepers to
@@ -81,3 +82,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6c23e30c0e5c..7dae9eb8c042 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
}
/*
- * Suspend-to-idle ("freeze") is a system state in which all user space
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
@@ -167,9 +167,9 @@ static void cpuidle_idle_call(void)
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze() || dev->use_deepest_state) {
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
+ if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle()) {
+ entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
@@ -209,6 +209,7 @@ exit_idle:
*/
static void do_idle(void)
{
+ int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -219,14 +220,13 @@ static void do_idle(void)
*/
__current_set_polling();
- quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172db63e..d518664cce4f 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..b71b436f59f2
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,155 @@
+/*
+ * Housekeeping management. Manage the targets for routine code that can run on
+ * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ *
+ */
+
+#include <linux/sched/isolation.h>
+#include <linux/tick.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/static_key.h>
+#include <linux/ctype.h>
+
+DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
+EXPORT_SYMBOL_GPL(housekeeping_overriden);
+static cpumask_var_t housekeeping_mask;
+static unsigned int housekeeping_flags;
+
+int housekeeping_any_cpu(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return housekeeping_mask;
+ return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ set_cpus_allowed_ptr(t, housekeeping_mask);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_test_cpu(cpu, housekeeping_mask);
+ return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+
+void __init housekeeping_init(void)
+{
+ if (!housekeeping_flags)
+ return;
+
+ static_branch_enable(&housekeeping_overriden);
+
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
+
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
+{
+ cpumask_var_t non_housekeeping_mask;
+ int err;
+
+ alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+ err = cpulist_parse(str, non_housekeeping_mask);
+ if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+
+ if (!housekeeping_flags) {
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, non_housekeeping_mask);
+ if (cpumask_empty(housekeeping_mask))
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ } else {
+ cpumask_var_t tmp;
+
+ alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+ if (!cpumask_equal(tmp, housekeeping_mask)) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ free_bootmem_cpumask_var(tmp);
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ free_bootmem_cpumask_var(tmp);
+ }
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ tick_nohz_full_setup(non_housekeeping_mask);
+ } else {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ }
+
+ housekeeping_flags |= flags;
+
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+
+ return 1;
+}
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+ unsigned int flags;
+
+ flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+ unsigned int flags = 0;
+
+ while (isalpha(*str)) {
+ if (!strncmp(str, "nohz,", 5)) {
+ str += 5;
+ flags |= HK_FLAG_TICK;
+ continue;
+ }
+
+ if (!strncmp(str, "domain,", 7)) {
+ str += 7;
+ flags |= HK_FLAG_DOMAIN;
+ continue;
+ }
+
+ pr_warn("isolcpus: Error, unknown flag\n");
+ return 0;
+ }
+
+ /* Default behaviour for isolcpus without flags */
+ if (!flags)
+ flags |= HK_FLAG_DOMAIN;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f14716a3522f..89a989e4d758 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/sched/loadavg.c
*
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..dd7908743dab
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
+
+#include "sched.h" /* for cpu_rq(). */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static int membarrier_private_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (!(atomic_read(&current->mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+
+ /*
+ * We need to consider threads belonging to different thread
+ * groups, which use the same mm. (CLONE_VM but not
+ * CLONE_THREAD).
+ */
+ if (atomic_read(&mm->membarrier_state)
+ & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+ return;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ &mm->membarrier_state);
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_SHARED:
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ return membarrier_private_expedited();
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ membarrier_register_private_expedited();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf937ef90..d8c43d73e078 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
@@ -73,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -96,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -970,7 +960,7 @@ static void update_curr_rt(struct rq *rq)
return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1875,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
- int prev_cpu = rq->rt.push_cpu;
+ struct root_domain *rd = rq->rd;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
+
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- * (None or zero): No IPI is going around for the current rq
- * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
- * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
- * has changed, and the IPI should restart
- * circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
{
- int cpu;
+ atomic_set_release(v, 0);
+}
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0)
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
+ rq = this_rq();
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rq->rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rq->rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0)
return;
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
-
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index cd200d16529e..a26473674fb7 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
static const u32 runnable_avg_yN_inv[] = {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..45ab0bf564e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -226,7 +227,7 @@ struct dl_bw {
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
@@ -255,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
@@ -418,6 +418,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
+ unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
u64 exec_clock;
@@ -426,8 +427,7 @@ struct cfs_rq {
u64 min_vruntime_copy;
#endif
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -444,18 +444,22 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
- u64 runnable_load_sum;
- unsigned long runnable_load_avg;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- unsigned long propagate_avg;
-#endif
- atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
+ struct {
+ raw_spinlock_t lock ____cacheline_aligned;
+ int nr;
+ unsigned long load_avg;
+ unsigned long util_avg;
+ unsigned long runnable_sum;
+ } removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
+
/*
* h_load = weight * f(tg)
*
@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@@ -524,12 +528,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -550,8 +548,7 @@ struct rt_rq {
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached root;
unsigned long dl_nr_running;
@@ -575,8 +572,7 @@ struct dl_rq {
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root pushable_dl_tasks_root;
- struct rb_node *pushable_dl_tasks_leftmost;
+ struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
#endif
@@ -640,6 +636,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -657,6 +666,9 @@ extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
#endif /* CONFIG_SMP */
/*
@@ -769,7 +781,7 @@ struct rq {
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
+ call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
@@ -1120,11 +1132,15 @@ extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
static inline void unregister_sched_domain_sysctl(void)
{
}
@@ -1217,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
# define const_debug const
#endif
-extern const_debug unsigned int sysctl_sched_features;
-
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -1230,6 +1244,13 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern const_debug unsigned int sysctl_sched_features;
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -1237,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
-
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+
+/*
+ * Each translation unit has its own copy of sysctl_sched_features to allow
+ * constants propagation at compile time and compiler optimization based on
+ * features default.
+ */
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+static const_debug __maybe_unused unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+#undef SCHED_FEAT
+
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@@ -1528,6 +1563,8 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
+extern void reweight_task(struct task_struct *p, int prio);
+
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -1950,6 +1987,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
@@ -2070,19 +2109,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
if (data)
data->func(data, rq_clock(rq), flags);
}
-
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
-{
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_update_util(rq, flags);
-}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..940b1fa1d2ce 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d5710651043b..baf500d12b7c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 9f69fb630853..45caf90b24cd 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610dcce11..9ff1555341ed 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched/signal.h>
#include <linux/swait.h>
@@ -33,9 +34,6 @@ void swake_up(struct swait_queue_head *q)
{
unsigned long flags;
- if (!swait_active(q))
- return;
-
raw_spin_lock_irqsave(&q->lock, flags);
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +49,6 @@ void swake_up_all(struct swait_queue_head *q)
struct swait_queue *curr;
LIST_HEAD(tmp);
- if (!swait_active(q))
- return;
-
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..034cbed7f88b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Scheduler topology setup/handling methods
*/
#include <linux/sched.h>
#include <linux/mutex.h>
+#include <linux/sched/isolation.h>
#include "sched.h"
@@ -14,11 +16,9 @@ cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_debug_enabled;
-
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = 1;
+ sched_debug_enabled = true;
return 0;
}
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
static int init_rootdomain(struct root_domain *rd)
{
- memset(rd, 0, sizeof(*rd));
-
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -272,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -311,7 +315,7 @@ static struct root_domain *alloc_rootdomain(void)
{
struct root_domain *rd;
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
if (!rd)
return NULL;
@@ -337,7 +341,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
- kfree(sg);
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
sg = tmp;
} while (sg != first);
}
@@ -345,15 +350,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
*/
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
+ free_sched_groups(sd->groups, 1);
+
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd);
@@ -463,26 +465,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
update_top_cache_domain(cpu);
}
-/* Setup the mask of CPUs configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- int ret;
-
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- ret = cpulist_parse(str, cpu_isolated_map);
- if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
- return 0;
- }
- return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -670,6 +658,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
else
cpumask_copy(sg_span, sched_domain_span(sd));
+ atomic_inc(&sg->ref);
return sg;
}
@@ -1161,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
@@ -1335,6 +1325,10 @@ void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;
+ /* Includes NUMA identity node at level 0. */
+ sched_domains_numa_distance[level++] = curr_distance;
+ sched_domains_numa_levels = level;
+
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
@@ -1382,8 +1376,7 @@ void sched_init_numa(void)
return;
/*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
+ * 'level' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1445,9 +1438,18 @@ void sched_init_numa(void)
tl[i] = sched_domain_topology[i];
/*
+ * Add the NUMA identity distance, aka single NODE.
+ */
+ tl[i++] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .numa_level = 0,
+ SD_INIT_NAME(NODE)
+ };
+
+ /*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 0; j < level; i++, j++) {
+ for (j = 1; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1595,7 +1597,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
@@ -1777,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@@ -1854,7 +1856,18 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
- n = doms_new ? ndoms_new : 0;
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
+ }
+ } else {
+ n = ndoms_new;
+ }
/* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) {
@@ -1870,11 +1883,11 @@ match1:
}
n = ndoms_cur;
- if (doms_new == NULL) {
+ if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..98feab7933c7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
}
EXPORT_SYMBOL(remove_wait_queue);
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key,
+ wait_queue_entry_t *bookmark)
{
wait_queue_entry_t *curr, *next;
+ int cnt = 0;
+
+ if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+ curr = list_next_entry(bookmark, entry);
+
+ list_del(&bookmark->entry);
+ bookmark->flags = 0;
+ } else
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
- list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
+
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
- int ret = curr->func(curr, mode, wake_flags, key);
+ int ret;
+
+ if (flags & WQ_FLAG_BOOKMARK)
+ continue;
+
+ ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
+
+ if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ (&next->entry != &wq_head->head)) {
+ bookmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&bookmark->entry, &next->entry);
+ break;
+ }
+ }
+ return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ unsigned long flags;
+ wait_queue_entry_t bookmark;
+
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
-
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
@@ -104,16 +155,23 @@ EXPORT_SYMBOL(__wake_up);
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+ __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -134,7 +192,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
if (unlikely(!wq_head))
@@ -143,9 +200,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..5f0dfb2abb8d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/seccomp.c
*
@@ -17,11 +18,13 @@
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
+#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
+#include <linux/sysctl.h>
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
@@ -42,6 +45,7 @@
* get/put helpers should be used when accessing an instance
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
+ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
*
@@ -57,6 +61,7 @@
*/
struct seccomp_filter {
refcount_t usage;
+ bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -171,20 +176,25 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
+ * @match: stores struct seccomp_filter that resulted in the return value,
+ * unless filter returned SECCOMP_RET_ALLOW, in which case it will
+ * be unchanged.
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(const struct seccomp_data *sd)
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
+static u32 seccomp_run_filters(const struct seccomp_data *sd,
+ struct seccomp_filter **match)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
- lockless_dereference(current->seccomp.filter);
+ READ_ONCE(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
- return SECCOMP_RET_KILL;
+ return SECCOMP_RET_KILL_PROCESS;
if (!sd) {
populate_seccomp_data(&sd_local);
@@ -198,8 +208,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
- if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
+ *match = f;
+ }
}
return ret;
}
@@ -444,6 +456,10 @@ static long seccomp_attach_filter(unsigned int flags,
return ret;
}
+ /* Set log flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_LOG)
+ filter->log = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -458,14 +474,19 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
+static void __get_seccomp_filter(struct seccomp_filter *filter)
+{
+ /* Reference count is bounded by the number of total processes. */
+ refcount_inc(&filter->usage);
+}
+
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
if (!orig)
return;
- /* Reference count is bounded by the number of total processes. */
- refcount_inc(&orig->usage);
+ __get_seccomp_filter(orig);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -476,10 +497,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
}
}
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
+static void __put_seccomp_filter(struct seccomp_filter *orig)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
@@ -488,6 +507,12 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ __put_seccomp_filter(tsk->seccomp.filter);
+}
+
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
memset(info, 0, sizeof(*info));
@@ -514,6 +539,65 @@ static void seccomp_send_sigsys(int syscall, int reason)
}
#endif /* CONFIG_SECCOMP_FILTER */
+/* For use with seccomp_actions_logged */
+#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
+#define SECCOMP_LOG_KILL_THREAD (1 << 1)
+#define SECCOMP_LOG_TRAP (1 << 2)
+#define SECCOMP_LOG_ERRNO (1 << 3)
+#define SECCOMP_LOG_TRACE (1 << 4)
+#define SECCOMP_LOG_LOG (1 << 5)
+#define SECCOMP_LOG_ALLOW (1 << 6)
+
+static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
+ SECCOMP_LOG_KILL_THREAD |
+ SECCOMP_LOG_TRAP |
+ SECCOMP_LOG_ERRNO |
+ SECCOMP_LOG_TRACE |
+ SECCOMP_LOG_LOG;
+
+static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
+ bool requested)
+{
+ bool log = false;
+
+ switch (action) {
+ case SECCOMP_RET_ALLOW:
+ break;
+ case SECCOMP_RET_TRAP:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
+ break;
+ case SECCOMP_RET_ERRNO:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
+ break;
+ case SECCOMP_RET_TRACE:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
+ break;
+ case SECCOMP_RET_LOG:
+ log = seccomp_actions_logged & SECCOMP_LOG_LOG;
+ break;
+ case SECCOMP_RET_KILL_THREAD:
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
+ break;
+ case SECCOMP_RET_KILL_PROCESS:
+ default:
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
+ }
+
+ /*
+ * Force an audit message to be emitted when the action is RET_KILL_*,
+ * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
+ * allowed to be logged by the admin.
+ */
+ if (log)
+ return __audit_seccomp(syscall, signr, action);
+
+ /*
+ * Let the audit subsystem decide if the action should be audited based
+ * on whether the current task itself is being audited.
+ */
+ return audit_seccomp(syscall, signr, action);
+}
+
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
* To be fully secure this must be combined with rlimit
@@ -539,7 +623,7 @@ static void __secure_computing_strict(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -566,6 +650,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_filter *match = NULL;
int data;
/*
@@ -574,9 +659,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
rmb();
- filter_ret = seccomp_run_filters(sd);
+ filter_ret = seccomp_run_filters(sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
- action = filter_ret & SECCOMP_RET_ACTION;
+ action = filter_ret & SECCOMP_RET_ACTION_FULL;
switch (action) {
case SECCOMP_RET_ERRNO:
@@ -637,14 +722,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
+ case SECCOMP_RET_LOG:
+ seccomp_log(this_syscall, 0, action, true);
+ return 0;
+
case SECCOMP_RET_ALLOW:
+ /*
+ * Note that the "match" filter will always be NULL for
+ * this action since SECCOMP_RET_ALLOW is the starting
+ * state in seccomp_run_filters().
+ */
return 0;
- case SECCOMP_RET_KILL:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_KILL_PROCESS:
default:
- audit_seccomp(this_syscall, SIGSYS, action);
+ seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
- if (get_nr_threads(current) == 1) {
+ if (action == SECCOMP_RET_KILL_PROCESS ||
+ get_nr_threads(current) == 1) {
siginfo_t info;
/* Show the original registers in the dump. */
@@ -653,13 +749,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
- do_exit(SIGSYS);
+ if (action == SECCOMP_RET_KILL_PROCESS)
+ do_group_exit(SIGSYS);
+ else
+ do_exit(SIGSYS);
}
unreachable();
skip:
- audit_seccomp(this_syscall, 0, action);
+ seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}
#else
@@ -794,6 +893,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
}
#endif
+static long seccomp_get_action_avail(const char __user *uaction)
+{
+ u32 action;
+
+ if (copy_from_user(&action, uaction, sizeof(action)))
+ return -EFAULT;
+
+ switch (action) {
+ case SECCOMP_RET_KILL_PROCESS:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_TRAP:
+ case SECCOMP_RET_ERRNO:
+ case SECCOMP_RET_TRACE:
+ case SECCOMP_RET_LOG:
+ case SECCOMP_RET_ALLOW:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
const char __user *uargs)
@@ -805,6 +927,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER:
return seccomp_set_mode_filter(flags, uargs);
+ case SECCOMP_GET_ACTION_AVAIL:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_get_action_avail(uargs);
default:
return -EINVAL;
}
@@ -908,13 +1035,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- get_seccomp_filter(task);
+ __get_seccomp_filter(filter);
spin_unlock_irq(&task->sighand->siglock);
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
- put_seccomp_filter(task);
+ __put_seccomp_filter(filter);
return ret;
out:
@@ -922,3 +1049,185 @@ out:
return ret;
}
#endif
+
+#ifdef CONFIG_SYSCTL
+
+/* Human readable action names for friendly sysctl interaction */
+#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
+#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
+#define SECCOMP_RET_TRAP_NAME "trap"
+#define SECCOMP_RET_ERRNO_NAME "errno"
+#define SECCOMP_RET_TRACE_NAME "trace"
+#define SECCOMP_RET_LOG_NAME "log"
+#define SECCOMP_RET_ALLOW_NAME "allow"
+
+static const char seccomp_actions_avail[] =
+ SECCOMP_RET_KILL_PROCESS_NAME " "
+ SECCOMP_RET_KILL_THREAD_NAME " "
+ SECCOMP_RET_TRAP_NAME " "
+ SECCOMP_RET_ERRNO_NAME " "
+ SECCOMP_RET_TRACE_NAME " "
+ SECCOMP_RET_LOG_NAME " "
+ SECCOMP_RET_ALLOW_NAME;
+
+struct seccomp_log_name {
+ u32 log;
+ const char *name;
+};
+
+static const struct seccomp_log_name seccomp_log_names[] = {
+ { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
+ { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
+ { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
+ { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+ { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
+ { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
+ { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
+ { }
+};
+
+static bool seccomp_names_from_actions_logged(char *names, size_t size,
+ u32 actions_logged)
+{
+ const struct seccomp_log_name *cur;
+ bool append_space = false;
+
+ for (cur = seccomp_log_names; cur->name && size; cur++) {
+ ssize_t ret;
+
+ if (!(actions_logged & cur->log))
+ continue;
+
+ if (append_space) {
+ ret = strscpy(names, " ", size);
+ if (ret < 0)
+ return false;
+
+ names += ret;
+ size -= ret;
+ } else
+ append_space = true;
+
+ ret = strscpy(names, cur->name, size);
+ if (ret < 0)
+ return false;
+
+ names += ret;
+ size -= ret;
+ }
+
+ return true;
+}
+
+static bool seccomp_action_logged_from_name(u32 *action_logged,
+ const char *name)
+{
+ const struct seccomp_log_name *cur;
+
+ for (cur = seccomp_log_names; cur->name; cur++) {
+ if (!strcmp(cur->name, name)) {
+ *action_logged = cur->log;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
+{
+ char *name;
+
+ *actions_logged = 0;
+ while ((name = strsep(&names, " ")) && *name) {
+ u32 action_logged = 0;
+
+ if (!seccomp_action_logged_from_name(&action_logged, name))
+ return false;
+
+ *actions_logged |= action_logged;
+ }
+
+ return true;
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ struct ctl_table table;
+ int ret;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ memset(names, 0, sizeof(names));
+
+ if (!write) {
+ if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ seccomp_actions_logged))
+ return -EINVAL;
+ }
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (write) {
+ u32 actions_logged;
+
+ if (!seccomp_actions_logged_from_names(&actions_logged,
+ table.data))
+ return -EINVAL;
+
+ if (actions_logged & SECCOMP_LOG_ALLOW)
+ return -EINVAL;
+
+ seccomp_actions_logged = actions_logged;
+ }
+
+ return 0;
+}
+
+static struct ctl_path seccomp_sysctl_path[] = {
+ { .procname = "kernel", },
+ { .procname = "seccomp", },
+ { }
+};
+
+static struct ctl_table seccomp_sysctl_table[] = {
+ {
+ .procname = "actions_avail",
+ .data = (void *) &seccomp_actions_avail,
+ .maxlen = sizeof(seccomp_actions_avail),
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "actions_logged",
+ .mode = 0644,
+ .proc_handler = seccomp_actions_logged_handler,
+ },
+ { }
+};
+
+static int __init seccomp_sysctl_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
+ if (!hdr)
+ pr_warn("seccomp: sysctl registration failed\n");
+ else
+ kmemleak_not_leak(hdr);
+
+ return 0;
+}
+
+device_initcall(seccomp_sysctl_init)
+
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/signal.c b/kernel/signal.c
index ed804a470dcd..8dcd8825b2de 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
}
#endif
+enum siginfo_layout siginfo_layout(int sig, int si_code)
+{
+ enum siginfo_layout layout = SIL_KILL;
+ if ((si_code > SI_USER) && (si_code < SI_KERNEL)) {
+ static const struct {
+ unsigned char limit, layout;
+ } filter[] = {
+ [SIGILL] = { NSIGILL, SIL_FAULT },
+ [SIGFPE] = { NSIGFPE, SIL_FAULT },
+ [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
+ [SIGBUS] = { NSIGBUS, SIL_FAULT },
+ [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
+#if defined(SIGEMT) && defined(NSIGEMT)
+ [SIGEMT] = { NSIGEMT, SIL_FAULT },
+#endif
+ [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
+ [SIGPOLL] = { NSIGPOLL, SIL_POLL },
+#ifdef __ARCH_SIGSYS
+ [SIGSYS] = { NSIGSYS, SIL_SYS },
+#endif
+ };
+ if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+ layout = filter[sig].layout;
+ else if (si_code <= NSIGPOLL)
+ layout = SIL_POLL;
+ } else {
+ if (si_code == SI_TIMER)
+ layout = SIL_TIMER;
+ else if (si_code == SI_SIGIO)
+ layout = SIL_POLL;
+ else if (si_code < 0)
+ layout = SIL_RT;
+ /* Tests to support buggy kernel ABIs */
+#ifdef TRAP_FIXME
+ if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
+ layout = SIL_FAULT;
+#endif
+#ifdef FPE_FIXME
+ if ((sig == SIGFPE) && (si_code == FPE_FIXME))
+ layout = SIL_FAULT;
+#endif
+ }
+ return layout;
+}
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
@@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
- switch (from->si_code & __SI_MASK) {
- case __SI_KILL:
+ err |= __put_user(from->si_code, &to->si_code);
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_TIMER:
- err |= __put_user(from->si_tid, &to->si_tid);
- err |= __put_user(from->si_overrun, &to->si_overrun);
- err |= __put_user(from->si_ptr, &to->si_ptr);
+ case SIL_TIMER:
+ /* Unreached SI_TIMER is negative */
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
@@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_pkey, &to->si_pkey);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
#ifdef __ARCH_SIGSYS
- case __SI_SYS:
+ case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
#endif
- default: /* this is just in case for now ... */
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
}
return err;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483cb3ad..084c8b3a2681 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
};
struct call_function_data {
- struct call_single_data __percpu *csd;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->csd = alloc_percpu(struct call_single_data);
+ cfd->csd = alloc_percpu(call_single_data_t);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
- * fields of the specified call_single_data structure:
+ * fields of the specified call_single_data_t structure:
*/
smp_wmb();
}
-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
smp_store_release(&csd->flags, 0);
}
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
/*
- * Insert a previously allocated call_single_data element
+ * Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
smp_call_func_t func, void *info)
{
if (cpu == smp_processor_id()) {
@@ -210,10 +210,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
- struct call_single_data *csd, *csd_next;
+ call_single_data_t *csd, *csd_next;
static bool warned;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data *csd;
- struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+ call_single_data_t *csd;
+ call_single_data_t csd_stack = {
+ .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ };
int this_cpu;
int err;
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
int err = 0;
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock(csd);
if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd;
+ call_single_data_t *csd;
csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
@@ -548,7 +550,7 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
/* Setup number of possible processor ids */
-int nr_cpu_ids __read_mostly = NR_CPUS;
+unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
* by the client, but only by calling this function.
* This function can only be called on a registered smp_hotplug_thread.
*/
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
- const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
{
struct cpumask *old = plug_thread->cpumask;
- cpumask_var_t tmp;
+ static struct cpumask tmp;
unsigned int cpu;
- if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
- return -ENOMEM;
-
- get_online_cpus();
+ lockdep_assert_cpus_held();
mutex_lock(&smpboot_threads_lock);
/* Park threads that were exclusively enabled on the old mask. */
- cpumask_andnot(tmp, old, new);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, old, new);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_park_thread(plug_thread, cpu);
/* Unpark threads that are exclusively enabled on the new mask. */
- cpumask_andnot(tmp, new, old);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, new, old);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_unpark_thread(plug_thread, cpu);
cpumask_copy(old, new);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
-
- free_cpumask_var(tmp);
-
- return 0;
}
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 485b81cfab34..34dd3d7ba40b 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef SMPBOOT_H
#define SMPBOOT_H
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4e09821f9d9e..662f7b1b7a78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
@@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq() || irqs_disabled());
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
#endif
@@ -396,9 +397,8 @@ void irq_exit(void)
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
#endif
-
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
@@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
- BUG_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
t->next = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, t);
diff --git a/kernel/sys.c b/kernel/sys.c
index 2855ee73acd0..83ffd7dccf23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/sys.c
*
@@ -110,6 +111,12 @@
#ifndef SET_FP_MODE
# define SET_FP_MODE(a,b) (-EINVAL)
#endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a) (-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -1896,15 +1903,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
/*
* Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local root should
+ * change /proc/pid/exe link: only local sys admin should
* be allowed to.
*/
if (prctl_map->exe_fd != (u32)-1) {
- struct user_namespace *ns = current_user_ns();
- const struct cred *cred = current_cred();
-
- if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
- !gid_eq(cred->gid, make_kgid(ns, 0)))
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
goto out;
}
@@ -2389,6 +2392,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SVE_SET_VL:
+ error = SVE_SET_VL(arg2);
+ break;
+ case PR_SVE_GET_VL:
+ error = SVE_GET_VL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef8576ce9..b5189762d275 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..9576bd582d4a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
@@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = {
#if defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
@@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &nmi_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = NMI_WATCHDOG_SYSCTL_PERM,
.proc_handler = proc_nmi_watchdog,
.extra1 = &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
.extra2 = &one,
-#else
- .extra2 = &zero,
-#endif
},
{
.procname = "watchdog_cpumask",
@@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = {
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
{
.procname = "soft_watchdog",
- .data = &soft_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &soft_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_soft_watchdog,
.extra1 = &zero,
.extra2 = &one,
@@ -1345,11 +1342,6 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
- .procname = "nr_pdflush_threads",
- .mode = 0444 /* read-only */,
- .proc_handler = pdflush_proc_obsolete,
- },
- {
.procname = "swappiness",
.data = &vm_swappiness,
.maxlen = sizeof(vm_swappiness),
@@ -2187,8 +2179,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 02e1859f2ca8..e8c0dab4fd65 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/stat.h>
#include <linux/sysctl.h>
#include "../fs/xfs/xfs_sysctl.h"
@@ -986,8 +987,9 @@ static ssize_t bin_intvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1016,6 +1018,7 @@ static ssize_t bin_intvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1029,7 +1032,7 @@ static ssize_t bin_intvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1057,8 +1060,9 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1087,6 +1091,7 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1100,7 +1105,7 @@ static ssize_t bin_ulongvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1120,8 +1125,9 @@ static ssize_t bin_uuid(struct file *file,
if (oldval && oldlen) {
char buf[UUID_STRING_LEN + 1];
uuid_t uuid;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1154,8 +1160,9 @@ static ssize_t bin_dn_node_address(struct file *file,
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1188,6 +1195,7 @@ static ssize_t bin_dn_node_address(struct file *file,
__le16 dnaddr;
char buf[15];
int len;
+ loff_t pos = 0;
result = -EINVAL;
if (newlen != sizeof(dnaddr))
@@ -1201,7 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- result = kernel_write(file, buf, len, 0);
+ result = kernel_write(file, buf, len, &pos);
if (result < 0)
goto out;
}
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..0fef395662a6 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/tracehook.h>
@@ -67,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = lockless_dereference(*pprev))) {
+ while ((work = READ_ONCE(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
@@ -96,20 +97,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ raw_spin_lock_irq(&task->pi_lock);
do {
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
+ raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
- /*
- * Synchronize with task_work_cancel(). It can't remove
- * the first entry == work, cmpxchg(task_works) should
- * fail, but it can play with *work and other entries.
- */
- raw_spin_unlock_wait(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 0dbab6d1acb4..dd53e354f630 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,7 +22,7 @@
#define div_factor 3
-static u32 rand1, preh_val, posth_val, jph_val;
+static u32 rand1, preh_val, posth_val;
static int errors, handler_errors, num_tests;
static u32 (*target)(u32 value);
static u32 (*target2)(u32 value);
@@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value)
static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("pre-handler is preemptible\n");
+ }
preh_val = (rand1 / div_factor);
return 0;
}
@@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("post-handler is preemptible\n");
+ }
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in post_handler\n");
@@ -154,8 +162,15 @@ static int test_kprobes(void)
}
+#if 0
+static u32 jph_val;
+
static u32 j_kprobe_target(u32 value)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("jprobe-handler is preemptible\n");
+ }
if (value != rand1) {
handler_errors++;
pr_err("incorrect value in jprobe handler\n");
@@ -227,11 +242,19 @@ static int test_jprobes(void)
return 0;
}
+#else
+#define test_jprobe() (0)
+#define test_jprobes() (0)
+#endif
#ifdef CONFIG_KRETPROBES
static u32 krph_val;
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe entry handler is preemptible\n");
+ }
krph_val = (rand1 / div_factor);
return 0;
}
@@ -240,6 +263,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long ret = regs_return_value(regs);
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe return handler is preemptible\n");
+ }
if (ret != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in kretprobe handler\n");
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index ac09bc29eb08..d689a9557e17 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -56,7 +56,7 @@ menu "Timers subsystem"
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
-# are supported independ of this.
+# are supported independent of this.
config TICK_ONESHOT
bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 938dbf33ef49..f1e46f338a9c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y += time.o timer.o hrtimer.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o alarmtimer.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0b8ff7d257ea..ec09ce9a6012 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -28,6 +28,7 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
#include <linux/compat.h>
+#include <linux/module.h>
#include "posix-timers.h"
@@ -56,9 +57,9 @@ static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
+#ifdef CONFIG_RTC_CLASS
static struct wakeup_source *ws;
-#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -89,6 +90,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
+ struct wakeup_source *__ws;
if (rtcdev)
return -EBUSY;
@@ -98,13 +100,25 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
+ __ws = wakeup_source_register("alarmtimer");
+
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
+ if (!try_module_get(rtc->owner)) {
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+ return -1;
+ }
+
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
+ ws = __ws;
+ __ws = NULL;
}
spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ wakeup_source_unregister(__ws);
+
return 0;
}
@@ -860,7 +874,6 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
- ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 4237e0744e26..16c027e9cc73 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
static int clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
- int64_t delta;
+ int64_t delta = 0;
+ int i;
- delta = dev->min_delta_ns;
- dev->next_event = ktime_add_ns(ktime_get(), delta);
+ for (i = 0; i < 10; i++) {
+ delta += dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (clockevent_state_shutdown(dev))
- return 0;
+ if (clockevent_state_shutdown(dev))
+ return 0;
- dev->retries++;
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- return dev->set_next_event((unsigned long) clc, dev);
+ dev->retries++;
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ if (dev->set_next_event((unsigned long) clc, dev) == 0)
+ return 0;
+ }
+ return -ETIME;
}
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88f75f92ef36..d32520840fde 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,7 @@ void clock_was_set(void)
*/
void hrtimers_resume(void)
{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hrtimers_resume() called with IRQs enabled!");
-
+ lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
/* And schedule a retrigger for all others */
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 2ef98a02376a..f26acef5d7b4 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/itimer.c
*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index edf19cc53140..8d70da1b9a0d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NTP state machine interfaces and logic.
*
@@ -492,6 +493,67 @@ out:
return leap;
}
+static void sync_hw_clock(struct work_struct *work);
+static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock);
+
+static void sched_sync_hw_clock(struct timespec64 now,
+ unsigned long target_nsec, bool fail)
+
+{
+ struct timespec64 next;
+
+ getnstimeofday64(&next);
+ if (!fail)
+ next.tv_sec = 659;
+ else {
+ /*
+ * Try again as soon as possible. Delaying long periods
+ * decreases the accuracy of the work queue timer. Due to this
+ * the algorithm is very likely to require a short-sleep retry
+ * after the above long sleep to synchronize ts_nsec.
+ */
+ next.tv_sec = 0;
+ }
+
+ /* Compute the needed delay that will get to tv_nsec == target_nsec */
+ next.tv_nsec = target_nsec - next.tv_nsec;
+ if (next.tv_nsec <= 0)
+ next.tv_nsec += NSEC_PER_SEC;
+ if (next.tv_nsec >= NSEC_PER_SEC) {
+ next.tv_sec++;
+ next.tv_nsec -= NSEC_PER_SEC;
+ }
+
+ queue_delayed_work(system_power_efficient_wq, &sync_work,
+ timespec64_to_jiffies(&next));
+}
+
+static void sync_rtc_clock(void)
+{
+ unsigned long target_nsec;
+ struct timespec64 adjust, now;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_RTC_SYSTOHC))
+ return;
+
+ getnstimeofday64(&now);
+
+ adjust = now;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+
+ /*
+ * The current RTC in use will provide the target_nsec it wants to be
+ * called at, and does rtc_tv_nsec_ok internally.
+ */
+ rc = rtc_set_ntp_time(adjust, &target_nsec);
+ if (rc == -ENODEV)
+ return;
+
+ sched_sync_hw_clock(now, target_nsec, rc);
+}
+
#ifdef CONFIG_GENERIC_CMOS_UPDATE
int __weak update_persistent_clock(struct timespec now)
{
@@ -507,76 +569,75 @@ int __weak update_persistent_clock64(struct timespec64 now64)
}
#endif
-#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
-static void sync_cmos_clock(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
-
-static void sync_cmos_clock(struct work_struct *work)
+static bool sync_cmos_clock(void)
{
+ static bool no_cmos;
struct timespec64 now;
- struct timespec64 next;
- int fail = 1;
+ struct timespec64 adjust;
+ int rc = -EPROTO;
+ long target_nsec = NSEC_PER_SEC / 2;
+
+ if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE))
+ return false;
+
+ if (no_cmos)
+ return false;
/*
- * If we have an externally synchronized Linux clock, then update
- * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
- * called as close as possible to 500 ms before the new second starts.
- * This code is run on a timer. If the clock is set, that timer
- * may not expire at the correct time. Thus, we adjust...
- * We want the clock to be within a couple of ticks from the target.
+ * Historically update_persistent_clock64() has followed x86
+ * semantics, which match the MC146818A/etc RTC. This RTC will store
+ * 'adjust' and then in .5s it will advance once second.
+ *
+ * Architectures are strongly encouraged to use rtclib and not
+ * implement this legacy API.
*/
- if (!ntp_synced()) {
- /*
- * Not synced, exit, do not restart a timer (if one is
- * running, let it run out).
- */
- return;
- }
-
getnstimeofday64(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec64 adjust = now;
-
- fail = -ENODEV;
+ if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) {
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock64(adjust);
-#endif
-
-#ifdef CONFIG_RTC_SYSTOHC
- if (fail == -ENODEV)
- fail = rtc_set_ntp_time(adjust);
-#endif
+ rc = update_persistent_clock64(adjust);
+ /*
+ * The machine does not support update_persistent_clock64 even
+ * though it defines CONFIG_GENERIC_CMOS_UPDATE.
+ */
+ if (rc == -ENODEV) {
+ no_cmos = true;
+ return false;
+ }
}
- next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
- if (next.tv_nsec <= 0)
- next.tv_nsec += NSEC_PER_SEC;
+ sched_sync_hw_clock(now, target_nsec, rc);
+ return true;
+}
- if (!fail || fail == -ENODEV)
- next.tv_sec = 659;
- else
- next.tv_sec = 0;
+/*
+ * If we have an externally synchronized Linux clock, then update RTC clock
+ * accordingly every ~11 minutes. Generally RTCs can only store second
+ * precision, but many RTCs will adjust the phase of their second tick to
+ * match the moment of update. This infrastructure arranges to call to the RTC
+ * set at the correct moment to phase synchronize the RTC second tick over
+ * with the kernel clock.
+ */
+static void sync_hw_clock(struct work_struct *work)
+{
+ if (!ntp_synced())
+ return;
- if (next.tv_nsec >= NSEC_PER_SEC) {
- next.tv_sec++;
- next.tv_nsec -= NSEC_PER_SEC;
- }
- queue_delayed_work(system_power_efficient_wq,
- &sync_cmos_work, timespec64_to_jiffies(&next));
+ if (sync_cmos_clock())
+ return;
+
+ sync_rtc_clock();
}
void ntp_notify_cmos_timer(void)
{
- queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
-}
-
-#else
-void ntp_notify_cmos_timer(void) { }
-#endif
+ if (!ntp_synced())
+ return;
+ if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) ||
+ IS_ENABLED(CONFIG_RTC_SYSTOHC))
+ queue_delayed_work(system_power_efficient_wq, &sync_work, 0);
+}
/*
* Propagate a new txc->status value into the NTP state:
@@ -653,67 +714,6 @@ static inline void process_adjtimex_modes(struct timex *txc,
}
-
-/**
- * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
- */
-int ntp_validate_timex(struct timex *txc)
-{
- if (txc->modes & ADJ_ADJTIME) {
- /* singleshot must not be used with any other mode bits */
- if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
- return -EINVAL;
- if (!(txc->modes & ADJ_OFFSET_READONLY) &&
- !capable(CAP_SYS_TIME))
- return -EPERM;
- } else {
- /* In order to modify anything, you gotta be super-user! */
- if (txc->modes && !capable(CAP_SYS_TIME))
- return -EPERM;
- /*
- * if the quartz is off by more than 10% then
- * something is VERY wrong!
- */
- if (txc->modes & ADJ_TICK &&
- (txc->tick < 900000/USER_HZ ||
- txc->tick > 1100000/USER_HZ))
- return -EINVAL;
- }
-
- if (txc->modes & ADJ_SETOFFSET) {
- /* In order to inject time, you gotta be super-user! */
- if (!capable(CAP_SYS_TIME))
- return -EPERM;
-
- if (txc->modes & ADJ_NANO) {
- struct timespec ts;
-
- ts.tv_sec = txc->time.tv_sec;
- ts.tv_nsec = txc->time.tv_usec;
- if (!timespec_inject_offset_valid(&ts))
- return -EINVAL;
-
- } else {
- if (!timeval_inject_offset_valid(&txc->time))
- return -EINVAL;
- }
- }
-
- /*
- * Check for potential multiplication overflows that can
- * only happen on 64-bit systems:
- */
- if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
- if (LLONG_MIN / PPM_SCALE > txc->freq)
- return -EINVAL;
- if (LLONG_MAX / PPM_SCALE < txc->freq)
- return -EINVAL;
- }
-
- return 0;
-}
-
-
/*
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index d8a7c11fa71a..909bd1f1bfb1 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NTP_INTERNAL_H
#define _LINUX_NTP_INTERNAL_H
@@ -7,7 +8,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(time64_t secs);
-extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a3bd5dbe0dc4..1f27887aa194 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implement CPU time clocks for the POSIX clock interface.
*/
@@ -602,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
ret = 0;
old_incr = timer->it.cpu.incr;
@@ -799,7 +800,6 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct list_head *timers = tsk->cpu_timers;
- struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
u64 expires;
unsigned long soft;
@@ -823,10 +823,9 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -847,7 +846,8 @@ static void check_thread_timers(struct task_struct *tsk,
*/
if (soft < hard) {
soft += USEC_PER_SEC;
- sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
+ tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
+ soft;
}
if (print_fatal_signals) {
pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
@@ -938,11 +938,10 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_CPU);
if (soft != RLIM_INFINITY) {
unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
u64 x;
if (psecs >= hard) {
/*
@@ -1035,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
/*
* Now re-arm for the new expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
arm_timer(timer);
unlock:
unlock_task_sighand(p, &flags);
@@ -1126,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
struct k_itimer *timer, *next;
unsigned long flags;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* The fast path checks that there are no expired thread or thread
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 06f34feb635e..b258bee13b02 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
}
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EINVAL;
}
- if (compat_get_timespec(&t, rqtp))
+ if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index fb303c3be4d3..151e28f5bf30 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#define TIMER_RETRY 1
struct k_clock {
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a7bb8f33ae07..58045eb976c3 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/time/tick-broadcast-hrtimer.c
* This file emulates a local clock event device
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index be0ac01f2e12..f8e1845aa464 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* tick internal variable and functions used by low/high res code
*/
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 6b009c207671..c1f518e7aa80 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force)
* We don't need the clock event device any more, stop it.
*/
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ dev->next_event = KTIME_MAX;
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c7a899c5ce64..99578f06c8d4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -27,6 +27,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
+#include <linux/mm.h>
#include <asm/irq_regs.h>
@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
-cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
@@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep)
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (unlikely(!cpu_online(cpu)))
return false;
@@ -385,20 +385,13 @@ out:
local_irq_restore(flags);
}
-/* Parse the boot-time nohz CPU list from the kernel parameters. */
-static int __init tick_nohz_full_setup(char *str)
+/* Get the boot-time nohz CPU list from the kernel parameters. */
+void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
- if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
- pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
- free_bootmem_cpumask_var(tick_nohz_full_mask);
- return 1;
- }
+ cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
-
- return 1;
}
-__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
return;
}
- if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
- WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
- cpumask_clear(tick_nohz_full_mask);
- tick_nohz_full_running = false;
- return;
- }
-
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
@@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
- cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
return;
}
@@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, tick_nohz_full_mask);
-
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
-
- /*
- * We need at least one CPU to handle housekeeping work such
- * as timekeeping, unbound timers, workqueues, ...
- */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
+ quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -960,8 +937,7 @@ void tick_nohz_idle_enter(void)
{
struct tick_sched *ts;
- WARN_ON_ONCE(irqs_disabled());
-
+ lockdep_assert_irqs_enabled();
/*
* Update the idle state in the scheduler domain hierarchy
* when tick_nohz_stop_sched_tick() is called from the idle loop.
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 075444e3d48e..954b43dbf21c 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TICK_SCHED_H
#define _TICK_SCHED_H
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 44a8c1402133..bd4e6c7dd689 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc)
SYSCALL_DEFINE1(stime, time_t __user *, tptr)
{
- struct timespec tv;
+ struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
@@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
tv.tv_nsec = 0;
- err = security_settime(&tv, NULL);
+ err = security_settime64(&tv, NULL);
if (err)
return err;
- do_settimeofday(&tv);
+ do_settimeofday64(&tv);
return 0;
}
@@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
- struct timespec tv;
+ struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
@@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
tv.tv_nsec = 0;
- err = security_settime(&tv, NULL);
+ err = security_settime64(&tv, NULL);
if (err)
return err;
- do_settimeofday(&tv);
+ do_settimeofday64(&tv);
return 0;
}
@@ -158,40 +158,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
}
/*
- * Indicates if there is an offset between the system clock and the hardware
- * clock/persistent clock/rtc.
- */
-int persistent_clock_is_local;
-
-/*
- * Adjust the time obtained from the CMOS to be UTC time instead of
- * local time.
- *
- * This is ugly, but preferable to the alternatives. Otherwise we
- * would either need to write a program to do it in /etc/rc (and risk
- * confusion if the program gets run more than once; it would also be
- * hard to make the program warp the clock precisely n hours) or
- * compile in the timezone information into the kernel. Bad, bad....
- *
- * - TYT, 1992-01-01
- *
- * The best thing to do is to keep the CMOS clock in universal time (UTC)
- * as real UNIX machines always do it. This avoids all headaches about
- * daylight saving times and warping kernel clocks.
- */
-static inline void warp_clock(void)
-{
- if (sys_tz.tz_minuteswest != 0) {
- struct timespec adjust;
-
- persistent_clock_is_local = 1;
- adjust.tv_sec = sys_tz.tz_minuteswest * 60;
- adjust.tv_nsec = 0;
- timekeeping_inject_offset(&adjust);
- }
-}
-
-/*
* In case for some reason the CMOS clock has not already been running
* in UTC, but in some local time: The first time we set the timezone,
* we will warp the clock so that it is ticking UTC time instead of
@@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
if (firsttime) {
firsttime = 0;
if (!tv)
- warp_clock();
+ timekeeping_warp_clock();
}
}
if (tv)
@@ -441,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
+#if __BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -501,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec);
+#endif
/**
* ns_to_timeval - Convert nanoseconds to timeval
@@ -520,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
-#if BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -581,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec64);
-#endif
+
/**
* msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
@@ -853,24 +820,6 @@ unsigned long nsecs_to_jiffies(u64 n)
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
/*
- * Add two timespec values and do a safety check for overflow.
- * It's assumed that both values are valid (>= 0)
- */
-struct timespec timespec_add_safe(const struct timespec lhs,
- const struct timespec rhs)
-{
- struct timespec res;
-
- set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
- lhs.tv_nsec + rhs.tv_nsec);
-
- if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
- res.tv_sec = TIME_T_MAX;
-
- return res;
-}
-
-/*
* Add two timespec64 values and do a safety check for overflow.
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e7e61c00d61..198afa78bf69 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -60,8 +60,27 @@ struct tk_fast {
struct tk_read_base base[2];
};
-static struct tk_fast tk_fast_mono ____cacheline_aligned;
-static struct tk_fast tk_fast_raw ____cacheline_aligned;
+/* Suspend-time cycles value for halted fast timekeeper. */
+static u64 cycles_at_suspend;
+
+static u64 dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
+
+static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -477,17 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-/* Suspend-time cycles value for halted fast timekeeper. */
-static u64 cycles_at_suspend;
-static u64 dummy_clock_read(struct clocksource *cs)
+/*
+ * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+ */
+static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
{
- return cycles_at_suspend;
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_real);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tk_clock_read(tkr),
+ tkr->cycle_last,
+ tkr->mask));
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
}
-static struct clocksource dummy_clock = {
- .read = dummy_clock_read,
-};
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ */
+u64 ktime_get_real_fast_ns(void)
+{
+ return __ktime_get_real_fast_ns(&tk_fast_mono);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
@@ -507,6 +548,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.clock = &dummy_clock;
+ tkr_dummy.base_real = tkr->base + tk->offs_real;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
@@ -654,6 +696,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
@@ -1264,33 +1307,31 @@ EXPORT_SYMBOL(do_settimeofday64);
*
* Adds or subtracts an offset value from the current time.
*/
-int timekeeping_inject_offset(struct timespec *ts)
+static int timekeeping_inject_offset(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec64 ts64, tmp;
+ struct timespec64 tmp;
int ret = 0;
- if (!timespec_inject_offset_valid(ts))
+ if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- ts64 = timespec_to_timespec64(*ts);
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/* Make sure the proposed value is valid */
- tmp = timespec64_add(tk_xtime(tk), ts64);
- if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ tmp = timespec64_add(tk_xtime(tk), *ts);
+ if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
- tk_xtime_add(tk, &ts64);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
+ tk_xtime_add(tk, ts);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
error: /* even if we error out, we forwarded the time, so call update */
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1303,7 +1344,40 @@ error: /* even if we error out, we forwarded the time, so call update */
return ret;
}
-EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives. Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours) or
+ * compile in the timezone information into the kernel. Bad, bad....
+ *
+ * - TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+void timekeeping_warp_clock(void)
+{
+ if (sys_tz.tz_minuteswest != 0) {
+ struct timespec64 adjust;
+
+ persistent_clock_is_local = 1;
+ adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+ adjust.tv_nsec = 0;
+ timekeeping_inject_offset(&adjust);
+ }
+}
/**
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
@@ -2064,7 +2138,7 @@ void update_wall_time(void)
goto out;
/* Do some additional sanity checking */
- timekeeping_check_update(real_tk, offset);
+ timekeeping_check_update(tk, offset);
/*
* With NO_HZ we may have to accumulate many cycle_intervals
@@ -2248,6 +2322,72 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
}
/**
+ * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
+ */
+static int timekeeping_validate_timex(struct timex *txc)
+{
+ if (txc->modes & ADJ_ADJTIME) {
+ /* singleshot must not be used with any other mode bits */
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+ return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+ /*
+ * if the quartz is off by more than 10% then
+ * something is VERY wrong!
+ */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+ }
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /*
+ * Validate if a timespec/timeval used to inject a time
+ * offset is valid. Offsets can be postive or negative, so
+ * we don't check tv_sec. The value of the timeval/timespec
+ * is the sum of its fields,but *NOTE*:
+ * The field tv_usec/tv_nsec must always be non-negative and
+ * we can't have more nanoseconds/microseconds than a second.
+ */
+ if (txc->time.tv_usec < 0)
+ return -EINVAL;
+
+ if (txc->modes & ADJ_NANO) {
+ if (txc->time.tv_usec >= NSEC_PER_SEC)
+ return -EINVAL;
+ } else {
+ if (txc->time.tv_usec >= USEC_PER_SEC)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex(struct timex *txc)
@@ -2259,12 +2399,12 @@ int do_adjtimex(struct timex *txc)
int ret;
/* Validate the data before disabling interrupts */
- ret = ntp_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc);
if (ret)
return ret;
if (txc->modes & ADJ_SETOFFSET) {
- struct timespec delta;
+ struct timespec64 delta;
delta.tv_sec = txc->time.tv_sec;
delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
@@ -2316,7 +2456,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
-#endif
+#endif /* CONFIG_NTP_PPS */
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index d0914676d4c5..7a9b4eb7a1d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_TIME_TIMEKEEPING_H
#define _KERNEL_TIME_TIMEKEEPING_H
/*
@@ -10,7 +11,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
-extern int timekeeping_inject_offset(struct timespec *ts);
+extern void timekeeping_warp_clock(void);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 38bc4d2208e8..0754cadfa9e6 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/time.h>
#include "timekeeping_internal.h"
@@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
- (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
+ pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 9a18f121f399..fdbeeb02dde9 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TIMEKEEPING_INTERNAL_H
#define _TIMEKEEPING_INTERNAL_H
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f2674a056c26..af0b8bae4502 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state)
}
/* Stub timer callback for improperly used timers. */
-static void stub_timer(unsigned long data)
+static void stub_timer(struct timer_list *unused)
{
WARN_ON(1);
}
@@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- setup_timer(timer, stub_timer, 0);
+ timer_setup(timer, stub_timer, 0);
return true;
case ODEBUG_STATE_ACTIVE:
@@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- setup_timer(timer, stub_timer, 0);
+ timer_setup(timer, stub_timer, 0);
return true;
default:
return false;
@@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
}
}
+#define MOD_TIMER_PENDING_ONLY 0x01
+#define MOD_TIMER_REDUCE 0x02
+
static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
@@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* larger granularity than you would get from adding a new
* timer with this expiry.
*/
- if (timer->expires == expires)
+ long diff = timer->expires - expires;
+
+ if (!diff)
+ return 1;
+ if (options & MOD_TIMER_REDUCE && diff <= 0)
return 1;
/*
@@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
base = lock_timer_base(timer, &flags);
forward_timer_base(base);
+ if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
+ time_before_eq(timer->expires, expires)) {
+ ret = 1;
+ goto out_unlock;
+ }
+
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* subsequent call will exit in the expires check above.
*/
if (idx == timer_get_idx(timer)) {
- timer->expires = expires;
+ if (!(options & MOD_TIMER_REDUCE))
+ timer->expires = expires;
+ else if (time_after(timer->expires, expires))
+ timer->expires = expires;
ret = 1;
goto out_unlock;
}
@@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
ret = detach_if_pending(timer, base, false);
- if (!ret && pending_only)
+ if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
debug_activate(timer, expires);
@@ -1042,7 +1058,7 @@ out_unlock:
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, true);
+ return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);
@@ -1068,11 +1084,26 @@ EXPORT_SYMBOL(mod_timer_pending);
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, false);
+ return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
/**
+ * timer_reduce - Modify a timer's timeout if it would reduce the timeout
+ * @timer: The timer to be modified
+ * @expires: New timeout in jiffies
+ *
+ * timer_reduce() is very similar to mod_timer(), except that it will only
+ * modify a running timer if that would reduce the expiration time (it will
+ * start a timer that isn't running).
+ */
+int timer_reduce(struct timer_list *timer, unsigned long expires)
+{
+ return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
+}
+EXPORT_SYMBOL(timer_reduce);
+
+/**
* add_timer - start a timer
* @timer: the timer to be added
*
@@ -1560,8 +1591,11 @@ static int collect_expired_timers(struct timer_base *base,
* jiffies, otherwise forward to the next expiry time:
*/
if (time_after(next, jiffies)) {
- /* The call site will increment clock! */
- base->clk = jiffies - 1;
+ /*
+ * The call site will increment base->clk and then
+ * terminate the expiry loop immediately.
+ */
+ base->clk = jiffies;
return 0;
}
base->clk = next;
@@ -1668,9 +1702,20 @@ void run_local_timers(void)
raise_softirq(TIMER_SOFTIRQ);
}
-static void process_timeout(unsigned long __data)
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
{
- wake_up_process((struct task_struct *)__data);
+ struct process_timer *timeout = from_timer(timeout, t, timer);
+
+ wake_up_process(timeout->task);
}
/**
@@ -1704,7 +1749,7 @@ static void process_timeout(unsigned long __data)
*/
signed long __sched schedule_timeout(signed long timeout)
{
- struct timer_list timer;
+ struct process_timer timer;
unsigned long expire;
switch (timeout)
@@ -1738,13 +1783,14 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
- setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire, false);
+ timer.task = current;
+ timer_setup_on_stack(&timer.timer, process_timeout, 0);
+ __mod_timer(&timer.timer, expire, 0);
schedule();
- del_singleshot_timer_sync(&timer);
+ del_singleshot_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer);
+ destroy_timer_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
(*n_offl_successes)++;
delta = jiffies - starttime;
- sum_offl += delta;
+ *sum_offl += delta;
if (*min_offl < 0) {
*min_offl = delta;
*max_offl = delta;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..f54b7b6b4a4b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -224,7 +224,7 @@ config HWLAT_TRACER
select GENERIC_TRACER
help
This tracer, when enabled will create one or more kernel threads,
- depening on what the cpumask file is set to, which each thread
+ depending on what the cpumask file is set to, which each thread
spinning in a loop looking for interruptions caused by
something other than the kernel. For example, if a
System Management Interrupt (SMI) takes a noticeable amount of
@@ -239,7 +239,7 @@ config HWLAT_TRACER
iteration
A kernel thread is created that will spin with interrupts disabled
- for "width" microseconds in every "widow" cycle. It will not spin
+ for "width" microseconds in every "window" cycle. It will not spin
for "window - width" microseconds, where the system can
continue to operate.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 90f2701d92a7..19a15b2f1190 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Do not instrument the tracer itself:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f86100a..206e0e2ace53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
+#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
@@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
+#define TRACE_BLK_OPT_CGROUP 0x2
+#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+ { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+ { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
+#endif
{ }
};
@@ -59,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
};
/* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
@@ -68,7 +76,8 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
+ const void *data, size_t len,
+ union kernfs_node_id *cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +85,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
0, pc);
if (!event)
return;
@@ -92,17 +102,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
- t->action = action;
+ t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
+ t->pdu_len = len + cgid_len;
+ if (cgid)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +134,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
+ sizeof(tsk->comm), NULL);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,11 +151,12 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
+ const char *fmt, ...)
{
int n;
va_list args;
@@ -167,7 +180,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ blkcg = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+#else
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +224,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data)
+ void *pdu_data, union kernfs_node_id *cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -215,6 +235,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -229,6 +250,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ if (cgid)
+ what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -241,7 +264,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len,
+ sizeof(*t) + pdu_len + cgid_len,
0, pc);
if (!event)
return;
@@ -258,7 +281,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -280,10 +303,12 @@ record_it:
t->action = what;
t->device = bt->dev;
t->error = error;
- t->pdu_len = pdu_len;
+ t->pdu_len = pdu_len + cgid_len;
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -305,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt)
kfree(bt);
}
+static void get_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (++blk_probes_ref == 1)
+ blk_register_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (!--blk_probes_ref)
+ blk_unregister_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
static void blk_trace_cleanup(struct blk_trace *bt)
{
blk_trace_free(bt);
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
+ put_probe_ref();
}
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
@@ -325,6 +365,17 @@ int blk_trace_remove(struct request_queue *q)
return 0;
}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_remove(q);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_remove);
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -359,7 +410,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
+ __trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -514,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto err;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
ret = 0;
err:
@@ -526,9 +576,8 @@ err:
return ret;
}
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -547,6 +596,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
return 0;
}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_setup(q, name, dev, bdev, arg);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -583,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
}
#endif
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
struct blk_trace *bt = q->blk_trace;
@@ -622,8 +684,25 @@ int blk_trace_startstop(struct request_queue *q, int start)
return ret;
}
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_startstop(q, start);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ */
+
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
@@ -641,12 +720,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -657,17 +736,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
- ret = blk_trace_startstop(q, start);
+ ret = __blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = blk_trace_remove(q);
+ ret = __blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
return ret;
}
@@ -678,10 +757,44 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
+ mutex_lock(&q->blk_trace_mutex);
+
if (q->blk_trace) {
- blk_trace_startstop(q, 0);
- blk_trace_remove(q);
+ __blk_trace_startstop(q, 0);
+ __blk_trace_remove(q);
}
+
+ mutex_unlock(&q->blk_trace_mutex);
+}
+
+#ifdef CONFIG_BLK_CGROUP
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ return NULL;
+
+ if (!bio->bi_css)
+ return NULL;
+ return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+}
+#else
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ return NULL;
+}
+#endif
+
+static union kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+ if (!rq->bio)
+ return NULL;
+ /* Use the first bio */
+ return blk_trace_bio_get_cgid(q, rq->bio);
}
/*
@@ -694,13 +807,15 @@ void blk_trace_shutdown(struct request_queue *q)
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
+ * @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what)
+ unsigned int nr_bytes, u32 what,
+ union kernfs_node_id *cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -713,32 +828,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL);
+ rq->cmd_flags, what, error, 0, NULL, cgid);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+ blk_trace_request_get_cgid(rq->q, rq));
}
/**
@@ -753,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u32 what, int error, union kernfs_node_id *cgid)
{
struct blk_trace *bt = q->blk_trace;
@@ -761,20 +880,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +903,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +912,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +928,14 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL);
+ NULL, NULL);
}
}
@@ -820,13 +945,14 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, NULL);
}
}
@@ -835,7 +961,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +978,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
}
@@ -868,7 +994,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu);
+ &rpdu, blk_trace_bio_get_cgid(q, bio));
}
}
@@ -896,12 +1022,12 @@ static void blk_add_trace_bio_remap(void *ignore,
return;
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}
/**
@@ -934,7 +1060,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}
/**
@@ -958,7 +1084,8 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, 0, len, data);
+ BLK_TA_DRV_DATA, 0, len, data,
+ blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1031,7 +1158,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
- if (t->action == BLK_TN_MESSAGE) {
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
@@ -1066,9 +1193,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
return (const struct blk_io_trace *)ent;
}
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
+{
+ return (void *)(te_blk_io_trace(ent) + 1) +
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
{
- return te_blk_io_trace(ent) + 1;
+ return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
+{
+ return te_blk_io_trace(ent)->pdu_len -
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1235,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
return te_blk_io_trace(ent)->error;
}
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent);
+ const __u64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r)
+ struct blk_io_trace_remap *r, bool has_cg)
{
- const struct blk_io_trace_remap *__r = pdu_start(ent);
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
__u64 sector_from = __r->sector_from;
r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1252,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+ bool has_cg);
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1131,24 +1272,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+ if (has_cg) {
+ const union kernfs_node_id *id = cgid_start(iter->ent);
+
+ if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
+ char blkcg_name_buf[NAME_MAX + 1] = "<...>";
+
+ cgroup_path_from_kernfs_id(id, blkcg_name_buf,
+ sizeof(blkcg_name_buf));
+ trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ blkcg_name_buf, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %x,%-x %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ id->ino, id->generation, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s,
+ const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
- pdu_buf = pdu_start(ent);
- pdu_len = te_blk_io_trace(ent)->pdu_len;
+ pdu_buf = pdu_start(ent, has_cg);
+ pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
@@ -1179,7 +1339,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_puts(s, ") ");
}
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1187,7 +1347,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
@@ -1199,10 +1359,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
}
static void blk_log_with_error(struct trace_seq *s,
- const struct trace_entry *ent)
+ const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
@@ -1215,18 +1375,18 @@ static void blk_log_with_error(struct trace_seq *s,
}
}
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
struct blk_io_trace_remap r = { .device_from = 0, };
- get_pdu_remap(ent, &r);
+ get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(r.device_from), MINOR(r.device_from),
(unsigned long long)r.sector_from);
}
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1235,30 +1395,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_printf(s, "[%s]\n", cmd);
}
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ get_pdu_int(ent, has_cg), cmd);
}
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg)
{
- const struct blk_io_trace *t = te_blk_io_trace(ent);
- trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putmem(s, pdu_start(ent, has_cg),
+ pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
@@ -1298,7 +1459,8 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1326,23 +1488,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
u16 what;
bool long_act;
blk_log_action_t *log_action;
+ bool has_cg;
t = te_blk_io_trace(iter->ent);
- what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
+ has_cg = t->action & __BLK_TA_CGROUP;
- if (t->action == BLK_TN_MESSAGE) {
- log_action(iter, long_act ? "message" : "m");
- blk_log_msg(s, iter->ent);
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m", has_cg);
+ blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
- log_action(iter, what2act[what].act[long_act]);
- what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act], has_cg);
+ what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
@@ -1447,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
-
+ put_probe_ref();
blk_trace_free(bt);
return 0;
}
@@ -1480,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto free_bt;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
return 0;
free_bt:
@@ -1622,7 +1783,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1641,7 +1802,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1683,7 +1844,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
if (value)
@@ -1709,7 +1870,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..a5580c670866 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,9 +15,11 @@
#include <linux/ctype.h>
#include "trace.h"
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
/**
* trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
@@ -29,7 +31,7 @@
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
@@ -49,9 +51,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
goto out;
}
- rcu_read_lock();
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
+ /*
+ * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+ * to all call sites, we did a bpf_prog_array_valid() there to check
+ * whether call->prog_array is empty or not, which is
+ * a heurisitc to speed up execution.
+ *
+ * If bpf_prog_array_valid() fetched prog_array was
+ * non-NULL, we go into trace_call_bpf() and do the actual
+ * proper rcu_dereference() under RCU lock.
+ * If it turns out that prog_array is NULL then, we bail out.
+ * For the opposite, if the bpf_prog_array_valid() fetched pointer
+ * was NULL, you'll skip the prog_array with the risk of missing
+ * out of events when it was updated in between this and the
+ * rcu_dereference() which is accepted risk.
+ */
+ ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
@@ -63,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
- int ret;
+ int ret = 0;
+
+ if (unlikely(size == 0))
+ goto out;
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
+ out:
return ret;
}
@@ -77,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -255,14 +274,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+ u64 *value, u64 *enabled, u64 *running)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- u64 value = 0;
- int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -275,7 +294,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value);
+ return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+ u64 value = 0;
+ int err;
+
+ err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
@@ -293,6 +320,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+ .func = bpf_perf_event_read_value,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
static __always_inline u64
@@ -499,6 +553,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
default:
return tracing_func_proto(func_id);
}
@@ -524,11 +580,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
@@ -576,6 +635,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+ .func = bpf_perf_prog_read_value_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -583,6 +668,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_perf_prog_read_value:
+ return &bpf_perf_prog_read_value_proto_tp;
default:
return tracing_func_proto(func_id);
}
@@ -602,11 +689,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
@@ -662,8 +752,67 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret = -EEXIST;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ if (ret < 0)
+ goto unlock;
+
+ /* set the new array to event->tp_event and set event->prog */
+ event->prog = prog;
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+ return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (!event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret < 0) {
+ bpf_prog_array_delete_safe(old_array, event->prog);
+ } else {
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+ }
+
+ bpf_prog_put(event->prog);
+ event->prog = NULL;
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 96cea88fa00f..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are per_cpu ops, they still need their
- * per_cpu field freed. Since, function tracing is
+ * If these are dynamic or per_cpu ops, they still
+ * need their data freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ goto free_ops;
+
return 0;
}
@@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPT))
synchronize_rcu_tasks();
+ free_ops:
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -4952,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -5690,10 +5689,51 @@ static int referenced_filters(struct dyn_ftrace *rec)
return cnt;
}
+static void
+clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int i;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+ entry = __ftrace_lookup_ip(hash, rec->ip);
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+ }
+}
+
+/* Clear any records from hashs */
+static void clear_mod_from_hashes(struct ftrace_page *pg)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash);
+ clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
+ struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
int order;
@@ -5723,14 +5763,25 @@ void ftrace_release_mod(struct module *mod)
ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- kfree(pg);
+
+ pg->next = tmp_page;
+ tmp_page = pg;
} else
last_pg = &pg->next;
}
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ for (pg = tmp_page; pg; pg = tmp_page) {
+
+ /* Needs to be called outside of ftrace_lock */
+ clear_mod_from_hashes(pg);
+
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ tmp_page = pg->next;
+ kfree(pg);
+ }
}
void ftrace_module_enable(struct module *mod)
@@ -6754,17 +6805,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 0c7dee221dca..21bb161c2316 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Power trace points
*
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..845f3805c73d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* if it happened, we have to fail the write.
*/
barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
local_dec(&cpu_buffer->committing);
local_dec(&cpu_buffer->commits);
return NULL;
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
index 4b3b5eaf94d1..25dec0b00280 100644
--- a/kernel/trace/rpm-traces.c
+++ b/kernel/trace/rpm-traces.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Power trace points
*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44004d8aa3b3..752e5daf0896 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void)
struct trace_array *tr;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->clear_trace)
+ continue;
+ tr->clear_trace = false;
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
@@ -2799,11 +2802,17 @@ static char *get_trace_buf(void)
if (!buffer || buffer->nesting >= 4)
return NULL;
- return &buffer->buffer[buffer->nesting++][0];
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting][0];
}
static void put_trace_buf(void)
{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
this_cpu_dec(trace_percpu_buffer->nesting);
}
@@ -4011,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->current_trace->print_max)
+ trace_buf = &tr->max_buffer;
+#endif
if (cpu == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(&tr->trace_buffer, cpu);
+ tracing_reset(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -5349,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t == tr->current_trace)
goto out;
+ /* Some tracers won't work on kernel command line */
+ if (system_state < SYSTEM_RUNNING && t->noboot) {
+ pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
+ t->name);
+ goto out;
+ }
+
/* Some tracers are only allowed for the top level buffer */
if (!trace_ok_for_array(t, tr)) {
ret = -EINVAL;
@@ -5658,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_on() && iter->pos)
+ if (!tracer_tracing_is_on(iter->tr) && iter->pos)
break;
mutex_unlock(&iter->mutex);
@@ -6220,7 +6242,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
tracing_reset_online_cpus(&tr->max_buffer);
#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 490ba229931d..6b0b343a36a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KERNEL_TRACE_H
#define _LINUX_KERNEL_TRACE_H
@@ -245,6 +246,7 @@ struct trace_array {
int stop_count;
int clock_id;
int nr_topts;
+ bool clear_trace;
struct tracer *current_trace;
unsigned int trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
@@ -443,6 +445,8 @@ struct tracer {
#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
#endif
+ /* True if tracer cannot be enabled in kernel param */
+ bool noboot;
};
@@ -1456,7 +1460,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
static inline void *event_file_data(struct file *filp)
{
- return ACCESS_ONCE(file_inode(filp)->i_private);
+ return READ_ONCE(file_inode(filp)->i_private);
}
extern struct mutex event_mutex;
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 16a8cf02eee9..79f838a75077 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/kthread.h>
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index ebdbfc2f2a64..be1d86ff753d 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM benchmark
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4d8fdf3184dc..4ad967453b6f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* unlikely profiler
*
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index adcdbbeae010..e954ae3d82c0 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file defines the trace event structures that go into the ring
* buffer directly. They are created via macros so that changes for them
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 36132f9280e6..87468398b9ed 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
tracing_stop_tgid_record();
- clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
call->class->reg(call, TRACE_REG_UNREGISTER, file);
@@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
/* WAS_ENABLED gets set but never cleared. */
- call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
+ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags);
}
break;
}
@@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call)
do_for_each_event_file(tr, file) {
if (file->event_call != call)
continue;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
+
ftrace_event_enable_disable(file, 0);
/*
* The do_for_each_event_file() is
@@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
- bool clear_trace = false;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod) {
- if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
- clear_trace = true;
+ if (call->mod == mod)
__trace_remove_event_call(call);
- }
}
up_write(&trace_event_sem);
@@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- if (clear_trace)
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 181e139a8057..61e7f0678d33 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps,
int pos = ps->lasterr_pos;
char *buf, *pbuf;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
+ buf = (char *)__get_free_page(GFP_KERNEL);
if (!buf)
return;
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
index bfd4dba0d603..39d7ef4f57cb 100644
--- a/kernel/trace/trace_events_filter_test.h
+++ b/kernel/trace/trace_events_filter_test.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM test
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 39aa7aa66468..548e62eb5c46 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_export.c - export basic ftrace utilities to user space
*
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a0910c0cdf2e..27f7ad12c4b1 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ring buffer based function tracer
*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d56123cdcc89..23c0b0cb5fb9 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
*
* Function graph tracer.
@@ -1543,7 +1544,7 @@ fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
- max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+ max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
pr_warn("Warning: could not register graph trace events\n");
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 57149bce6aad..d953c163a079 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kdb helper for dumping the ftrace buffer
*
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1174,13 +1174,12 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index cd7480d0a201..b0388016b687 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Memory mapped I/O tracing
*
@@ -282,6 +283,7 @@ static struct tracer mmio_tracer __read_mostly =
.close = mmio_close,
.read = mmio_read,
.print_line = mmio_print_line,
+ .noboot = true,
};
__init static int init_mmio_trace(void)
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 49f61fe96a6b..50523f953a5d 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* nop tracer
*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..90db994ac900 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index fabc49bcd493..dbba03ed96de 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TRACE_EVENTS_H
#define __TRACE_EVENTS_H
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b341c02730be..e288168661e1 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace context switch
*
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..a86b303e6c67 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace task wakeup timings
*
@@ -397,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
+ entry->prev_state = task_state_index(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = next->state;
+ entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
+ entry->prev_state = task_state_index(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
+ entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cb917cebae29..cd70eb5df38e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Include in trace.c */
#include <uapi/linux/sched/types.h>
@@ -273,7 +274,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out_free;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
- goto out;
+ goto out_free;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index b4c475a0a48b..8cda06a10d66 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "trace.h"
int DYN_FTRACE_TEST_NAME(void)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a4df67cbc711..734accc02418 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*
@@ -77,7 +78,7 @@ check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
- int frame_size = ACCESS_ONCE(tracer_frame);
+ int frame_size = READ_ONCE(tracer_frame);
int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
@@ -96,23 +97,9 @@ check_stack(unsigned long ip, unsigned long *stack)
if (in_nmi())
return;
- /*
- * There's a slight chance that we are tracing inside the
- * RCU infrastructure, and rcu_irq_enter() will not work
- * as expected.
- */
- if (unlikely(rcu_irq_enter_disabled()))
- return;
-
local_irq_save(flags);
arch_spin_lock(&stack_trace_max_lock);
- /*
- * RCU may not be watching, make it see us.
- * The stack trace code uses rcu_sched.
- */
- rcu_irq_enter();
-
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
@@ -205,7 +192,6 @@ check_stack(unsigned long ip, unsigned long *stack)
}
out:
- rcu_irq_exit();
arch_spin_unlock(&stack_trace_max_lock);
local_irq_restore(flags);
}
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 413ff108fbd0..75bf1bcb4a8a 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Infrastructure for statistic tracing (histogram output).
*
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 8f03914b9a6a..76d30b4ebe83 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TRACE_STAT_H
#define __TRACE_STAT_H
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 74d9a86eccc0..19bcaaac884b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/syscalls.h>
@@ -559,11 +560,30 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec)
+{
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long args[SYSCALL_DEFINE_MAXARGS];
+ } param;
+ int i;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ for (i = 0; i < sys_data->nb_args; i++)
+ param.args[i] = rec->args[i];
+ return trace_call_bpf(call, &param);
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -579,7 +599,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -594,6 +615,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
+
+ if ((valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL, NULL);
@@ -633,11 +662,27 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_trace_exit *rec)
+{
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long ret;
+ } param;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ param.ret = rec->ret;
+ return trace_call_bpf(call, &param);
+}
+
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -653,7 +698,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
return;
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -666,6 +712,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
+
+ if ((valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL, NULL);
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..153c0e411461 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
- struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 618838f5f30a..ab0ca77331d0 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TRACING_MAP_H
#define __TRACING_MAP_H
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 5c2dc5b2bf4f..ce74a4901d2b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Wrapper functions for 16bit uid back compatibility. All nicely tied
* together in the faint hope we can take the out in five years time.
diff --git a/kernel/umh.c b/kernel/umh.c
new file mode 100644
index 000000000000..6ff9905250ff
--- /dev/null
+++ b/kernel/umh.c
@@ -0,0 +1,568 @@
+/*
+ * umh - the kernel usermode helper
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/module.h>
+
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int call_usermodehelper_exec_async(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct cred *new;
+ int retval;
+
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto out;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
+ goto out;
+ }
+ }
+
+ commit_creds(new);
+
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+ * if UHM_WAIT_PROC.
+ */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
+ if (!retval)
+ return 0;
+ do_exit(0);
+}
+
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
+{
+ pid_t pid;
+
+ /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ kernel_sigaction(SIGCHLD, SIG_DFL);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret = -ECHILD;
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
+ umh_complete(sub_info);
+}
+
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit a widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow to block on
+ * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
+ }
+}
+
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
+ */
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_disable() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_disable() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+int usermodehelper_read_trylock(void)
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
+
+/**
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
+{
+ long retval;
+
+ if (!depth)
+ return -EINVAL;
+
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ up_write(&umhelper_sem);
+
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ return -EAGAIN;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
+ sub_info->path = path;
+#endif
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocessa
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
+ helper_lock();
+ if (usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
+ sub_info->wait = wait;
+
+ queue_work(system_unbound_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
+ wait_for_completion(&done);
+wait_done:
+ retval = sub_info->retval;
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
+
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
+ */
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
unsigned long flags;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2f735cbe05e8..d32b45662fb6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file,
int proc_setgroups_show(struct seq_file *seq, void *v)
{
struct user_namespace *ns = seq->private;
- unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+ unsigned long userns_flags = READ_ONCE(ns->flags);
seq_printf(seq, "%s\n",
(userns_flags & USERNS_SETGROUPS_ALLOWED) ?
@@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns)
}
/*
- * Returns true if @ns is the same namespace as or a descendant of
- * @target_ns.
+ * Returns true if @child is the same namespace or a descendant of
+ * @ancestor.
*/
+bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
+{
+ const struct user_namespace *ns;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return (ns == ancestor);
+}
+
bool current_in_userns(const struct user_namespace *target_ns)
{
- struct user_namespace *ns;
- for (ns = current_user_ns(); ns; ns = ns->parent) {
- if (ns == target_ns)
- return true;
- }
- return false;
+ return in_userns(target_ns, current_user_ns());
}
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..576d18045811 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Detect hard and soft lockups on a system
*
@@ -24,25 +25,35 @@
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/kthread.h>
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
-int __read_mostly nmi_watchdog_enabled;
+static DEFINE_MUTEX(watchdog_mutex);
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
- NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 1
#else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 0
#endif
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
+int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
@@ -56,9 +67,9 @@ unsigned int __read_mostly hardlockup_panic =
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
}
static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +79,24 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
-
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/*
* These functions can be overridden if an architecture implements its
@@ -121,36 +108,68 @@ int __read_mostly watchdog_suspended;
*/
int __weak watchdog_nmi_enable(unsigned int cpu)
{
+ hardlockup_detector_perf_enable();
return 0;
}
+
void __weak watchdog_nmi_disable(unsigned int cpu)
{
+ hardlockup_detector_perf_disable();
}
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
- * - nmi_watchdog_enabled
+/* Return 0, if a NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+ return hardlockup_detector_perf_init();
+}
+
+/**
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ *
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
+ * update_variables();
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
+ *
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
+ * - watchdog_enabled
* - watchdog_thresh
* - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
- * - watchdog_suspended
*/
-void __weak watchdog_nmi_reconfigure(void)
+void __weak watchdog_nmi_start(void) { }
+
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
{
+ watchdog_enabled = 0;
+ if (!watchdog_user_enabled)
+ return;
+ if (nmi_watchdog_available && nmi_watchdog_user_enabled)
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ if (soft_watchdog_user_enabled)
+ watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
}
-
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
- for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+static bool softlockup_threads_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +183,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
-unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
static int __init softlockup_panic_setup(char *str)
{
softlockup_panic = simple_strtoul(str, NULL, 0);
-
return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
static int __init nosoftlockup_setup(char *str)
{
- watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+ soft_watchdog_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
- sysctl_softlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
+ sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
- sysctl_hardlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
- return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
#endif
+static void __lockup_detector_cleanup(void);
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +287,15 @@ void touch_all_softlockup_watchdogs(void)
int cpu;
/*
- * this is done lockless
- * do we care if a 0 races with a timestamp?
- * all it means is the softlock check starts one cycle later
+ * watchdog_mutex cannpt be taken here, as this might be called
+ * from (soft)interrupt context, so the access to
+ * watchdog_allowed_cpumask might race with a concurrent update.
+ *
+ * The watchdog time stamp can race against a concurrent real
+ * update as well, the only side effect might be a cycle delay for
+ * the softlockup check.
*/
- for_each_watchdog_cpu(cpu)
+ for_each_cpu(cpu, &watchdog_allowed_mask)
per_cpu(watchdog_touch_ts, cpu) = 0;
wq_watchdog_touch(-1);
}
@@ -322,9 +335,6 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -333,7 +343,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
- if (atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
@@ -447,32 +457,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- /* Enable the perf event */
- watchdog_nmi_enable(cpu);
-
- /* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
- /* initialize timestamp */
- watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ /* Initialize timestamp */
__touch_watchdog();
+ /* Enable the perf event */
+ if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+ watchdog_nmi_enable(cpu);
+
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}
static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +515,6 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
-
- /*
- * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
- * failure path. Check for failures that can occur asynchronously -
- * for example, when CPUs are on-lined - and shut down the hardware
- * perf event on each CPU accordingly.
- *
- * The only non-obvious place this bit can be cleared is through
- * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
- * pr_info here would be too noisy as it would result in a message
- * every few seconds if the hardlockup was disabled but the softlockup
- * enabled.
- */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- watchdog_nmi_disable(cpu);
}
static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +528,174 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
+static void softlockup_update_smpboot_threads(void)
{
- int cpu, ret = 0;
-
- atomic_set(&watchdog_park_in_progress, 1);
+ lockdep_assert_held(&watchdog_mutex);
- for_each_watchdog_cpu(cpu) {
- ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
- if (ret)
- break;
- }
-
- atomic_set(&watchdog_park_in_progress, 0);
+ if (!softlockup_threads_initialized)
+ return;
- return ret;
+ smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+ &watchdog_allowed_mask);
}
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
{
- int cpu;
-
- for_each_watchdog_cpu(cpu)
- kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ cpumask_clear(&watchdog_allowed_mask);
+ softlockup_update_smpboot_threads();
}
-static int update_watchdog_all_cpus(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
{
- int ret;
-
- ret = watchdog_park_threads();
- if (ret)
- return ret;
-
- watchdog_unpark_threads();
-
- return 0;
+ cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+ softlockup_update_smpboot_threads();
}
-static int watchdog_enable_all_cpus(void)
+static void lockup_detector_reconfigure(void)
{
- int err = 0;
-
- if (!watchdog_running) {
- err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
- &watchdog_cpumask);
- if (err)
- pr_err("Failed to create watchdog threads, disabled\n");
- else
- watchdog_running = 1;
- } else {
- /*
- * Enable/disable the lockup detectors or
- * change the sample period 'on the fly'.
- */
- err = update_watchdog_all_cpus();
-
- if (err) {
- watchdog_disable_all_cpus();
- pr_err("Failed to update lockup detectors, disabled\n");
- }
- }
-
- if (err)
- watchdog_enabled = 0;
-
- return err;
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ softlockup_park_all_threads();
+ set_sample_period();
+ lockup_detector_update_enable();
+ if (watchdog_enabled && watchdog_thresh)
+ softlockup_unpark_threads();
+ watchdog_nmi_start();
+ cpus_read_unlock();
+ /*
+ * Must be called outside the cpus locked section to prevent
+ * recursive locking in the perf code.
+ */
+ __lockup_detector_cleanup();
}
-static void watchdog_disable_all_cpus(void)
+/*
+ * Create the watchdog thread infrastructure and configure the detector(s).
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty. When
+ * the threads are sucessfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void lockup_detector_setup(void)
{
- if (watchdog_running) {
- watchdog_running = 0;
- smpboot_unregister_percpu_thread(&watchdog_threads);
- }
-}
+ int ret;
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
- return smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask);
-}
-#endif
+ /*
+ * If sysctl is off and watchdog got disabled on the command line,
+ * nothing to do here.
+ */
+ lockup_detector_update_enable();
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
- return 0;
-}
+ if (!IS_ENABLED(CONFIG_SYSCTL) &&
+ !(watchdog_enabled && watchdog_thresh))
+ return;
-static void watchdog_unpark_threads(void)
-{
-}
+ ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_allowed_mask);
+ if (ret) {
+ pr_err("Failed to initialize soft lockup detector threads\n");
+ return;
+ }
-static int watchdog_enable_all_cpus(void)
-{
- return 0;
+ mutex_lock(&watchdog_mutex);
+ softlockup_threads_initialized = true;
+ lockup_detector_reconfigure();
+ mutex_unlock(&watchdog_mutex);
}
-static void watchdog_disable_all_cpus(void)
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static void lockup_detector_reconfigure(void)
{
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ lockup_detector_update_enable();
+ watchdog_nmi_start();
+ cpus_read_unlock();
}
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
+static inline void lockup_detector_setup(void)
{
- return 0;
+ lockup_detector_reconfigure();
}
-#endif
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
-static void set_sample_period(void)
+static void __lockup_detector_cleanup(void)
{
+ lockdep_assert_held(&watchdog_mutex);
+ hardlockup_detector_perf_cleanup();
}
-#endif /* SOFTLOCKUP */
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
*/
-int lockup_detector_suspend(void)
+void lockup_detector_cleanup(void)
{
- int ret = 0;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
- /*
- * Multiple suspend requests can be active in parallel (counted by
- * the 'watchdog_suspended' variable). If the watchdog threads are
- * running, the first caller takes care that they will be parked.
- * The state of 'watchdog_running' cannot change while a suspend
- * request is active (see related code in 'proc' handlers).
- */
- if (watchdog_running && !watchdog_suspended)
- ret = watchdog_park_threads();
-
- if (ret == 0)
- watchdog_suspended++;
- else {
- watchdog_disable_all_cpus();
- pr_err("Failed to suspend lockup detectors, disabled\n");
- watchdog_enabled = 0;
- }
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
-
- return ret;
+ mutex_lock(&watchdog_mutex);
+ __lockup_detector_cleanup();
+ mutex_unlock(&watchdog_mutex);
}
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_poweroff() function which busy loops forever.
*/
-void lockup_detector_resume(void)
+void lockup_detector_soft_poweroff(void)
{
- mutex_lock(&watchdog_proc_mutex);
-
- watchdog_suspended--;
- /*
- * The watchdog threads are unparked if they were previously running
- * and if there is no more active suspend request.
- */
- if (watchdog_running && !watchdog_suspended)
- watchdog_unpark_threads();
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ watchdog_enabled = 0;
}
#ifdef CONFIG_SYSCTL
-/*
- * Update the run state of the lockup detectors.
- */
-static int proc_watchdog_update(void)
+/* Propagate any changes to the watchdog threads */
+static void proc_watchdog_update(void)
{
- int err = 0;
-
- /*
- * Watchdog threads won't be started if they are already active.
- * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
- * care of this. If those threads are already active, the sample
- * period will be updated and the lockup detectors will be enabled
- * or disabled 'on the fly'.
- */
- if (watchdog_enabled && watchdog_thresh)
- err = watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
-
- watchdog_nmi_reconfigure();
-
- return err;
-
+ /* Remove impossible cpus to keep sysctl output clean. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+ lockup_detector_reconfigure();
}
/*
* common function for watchdog, nmi_watchdog and soft_watchdog parameter
*
- * caller | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- * | | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller | table->data points to | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
+ * | | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
- int *watchdog_param = (int *)table->data;
+ int err, old, *param = table->data;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ mutex_lock(&watchdog_mutex);
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
-
- /*
- * If the parameter is being read return the state of the corresponding
- * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
- * run state of the lockup detectors.
- */
if (!write) {
- *watchdog_param = (watchdog_enabled & which) != 0;
+ /*
+ * On read synchronize the userspace interface. This is a
+ * racy snapshot.
+ */
+ *param = (watchdog_enabled & which) != 0;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
} else {
+ old = READ_ONCE(*param);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err)
- goto out;
-
- /*
- * There is a race window between fetching the current value
- * from 'watchdog_enabled' and storing the new value. During
- * this race window, watchdog_nmi_enable() can sneak in and
- * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
- * The 'cmpxchg' detects this race and the loop retries.
- */
- do {
- old = watchdog_enabled;
- /*
- * If the parameter value is not zero set the
- * corresponding bit(s), else clear it(them).
- */
- if (*watchdog_param)
- new = old | which;
- else
- new = old & ~which;
- } while (cmpxchg(&watchdog_enabled, old, new) != old);
-
- /*
- * Update the run state of the lockup detectors. There is _no_
- * need to check the value returned by proc_watchdog_update()
- * and to restore the previous value of 'watchdog_enabled' as
- * both lockup detectors are disabled if proc_watchdog_update()
- * returns an error.
- */
- if (old == new)
- goto out;
-
- err = proc_watchdog_update();
+ if (!err && old != READ_ONCE(*param))
+ proc_watchdog_update();
}
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -835,6 +715,8 @@ int proc_watchdog(struct ctl_table *table, int write,
int proc_nmi_watchdog(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ if (!nmi_watchdog_available && write)
+ return -ENOTSUPP;
return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -855,39 +737,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ int err, old;
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
- old = ACCESS_ONCE(watchdog_thresh);
+ old = READ_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err || !write)
- goto out;
+ if (!err && write && old != READ_ONCE(watchdog_thresh))
+ proc_watchdog_update();
- /*
- * Update the sample period. Restore on failure.
- */
- new = ACCESS_ONCE(watchdog_thresh);
- if (old == new)
- goto out;
-
- set_sample_period();
- err = proc_watchdog_update();
- if (err) {
- watchdog_thresh = old;
- set_sample_period();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -902,55 +762,26 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
{
int err;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
-
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
- if (!err && write) {
- /* Remove impossible cpus to keep sysctl output cleaner. */
- cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
- cpu_possible_mask);
+ if (!err && write)
+ proc_watchdog_update();
- if (watchdog_running) {
- /*
- * Failure would be due to being unable to allocate
- * a temporary cpumask, so we are likely not in a
- * position to do much else to make things better.
- */
- if (watchdog_update_cpus() != 0)
- pr_err("cpumask update failed\n");
- }
-
- watchdog_nmi_reconfigure();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
-
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
- set_sample_period();
-
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_enabled()) {
+ if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_copy(&watchdog_cpumask, housekeeping_mask);
- } else
- cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#else
- cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#endif
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
+ cpumask_copy(&watchdog_cpumask,
+ housekeeping_cpumask(HK_FLAG_TIMER));
+
+ if (!watchdog_nmi_probe())
+ nmi_watchdog_available = true;
+ lockup_detector_setup();
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..e449a23e9d59 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Detect hard lockups on a system
*
@@ -12,6 +13,7 @@
#define pr_fmt(fmt) "NMI watchdog: " fmt
#include <linux/nmi.h>
+#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/sched/debug.h>
@@ -21,8 +23,11 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
static unsigned long hardlockup_allcpu_dumped;
+static atomic_t watchdog_cpus = ATOMIC_INIT(0);
void arch_touch_nmi_watchdog(void)
{
@@ -103,15 +108,12 @@ static struct perf_event_attr wd_hw_attr = {
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
@@ -160,104 +162,134 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
+static int hardlockup_detector_event_create(void)
{
+ unsigned int cpu = smp_processor_id();
struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
- int firstcpu = 0;
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- if (atomic_inc_return(&watchdog_cpus) == 1)
- firstcpu = 1;
+ struct perf_event *evt;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ watchdog_overflow_callback, NULL);
+ if (IS_ERR(evt)) {
+ pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
+ return PTR_ERR(evt);
+ }
+ this_cpu_write(watchdog_ev, evt);
+ return 0;
+}
- /* save the first cpu's error for future comparision */
- if (firstcpu && IS_ERR(event))
- firstcpu_err = PTR_ERR(event);
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+ if (hardlockup_detector_event_create())
+ return;
- if (!IS_ERR(event)) {
- /* only print for the first cpu initialized */
- if (firstcpu || firstcpu_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
+ /* use original value for check */
+ if (!atomic_fetch_inc(&watchdog_cpus))
+ pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
+ perf_event_enable(this_cpu_read(watchdog_ev));
}
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ struct perf_event *event = this_cpu_read(watchdog_ev);
if (event) {
perf_event_disable(event);
- per_cpu(watchdog_ev, cpu) = NULL;
+ this_cpu_write(watchdog_ev, NULL);
+ this_cpu_write(dead_event, event);
+ cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+ atomic_dec(&watchdog_cpus);
+ }
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, &dead_events_mask) {
+ struct perf_event *event = per_cpu(dead_event, cpu);
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
+ /*
+ * Required because for_each_cpu() reports unconditionally
+ * CPU0 as set on UP kernels. Sigh.
+ */
+ if (event)
+ perf_event_release_kernel(event);
+ per_cpu(dead_event, cpu) = NULL;
+ }
+ cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_disable(event);
+ }
+}
+
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return;
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_enable(event);
+ }
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+ int ret = hardlockup_detector_event_create();
- /* watchdog_nmi_enable() expects this to be zero initially. */
- if (atomic_dec_and_test(&watchdog_cpus))
- firstcpu_err = 0;
+ if (ret) {
+ pr_info("Perf NMI watchdog permanently disabled\n");
+ } else {
+ perf_event_release_kernel(this_cpu_read(watchdog_ev));
+ this_cpu_write(watchdog_ev, NULL);
}
+ return ret;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca937b0c3a96..7368b57842ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -21,7 +21,7 @@
* pools for workqueues which are not bound to any specific CPU - the
* number of these backing pools is dynamic.
*
- * Please read Documentation/workqueue.txt for details.
+ * Please read Documentation/core-api/workqueue.rst for details.
*/
#include <linux/export.h>
@@ -68,6 +68,7 @@ enum {
* attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
+ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
/* worker flags */
@@ -165,7 +166,6 @@ struct worker_pool {
/* L: hash of busy workers */
/* see manage_workers() for details on the two manager mutexes */
- struct mutex manager_arb; /* manager arbitration */
struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
@@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = mutex_is_locked(&pool->manager_arb);
+ bool managing = pool->flags & POOL_MANAGER_ACTIVE;
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
@@ -1375,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
debug_work_activate(work);
@@ -1492,9 +1493,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL(queue_work_on);
-void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(struct timer_list *t)
{
- struct delayed_work *dwork = (struct delayed_work *)__data;
+ struct delayed_work *dwork = from_timer(dwork, t, timer);
/* should have been called from irqsafe timer with irq already off */
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
@@ -1508,8 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -1832,9 +1832,9 @@ static void destroy_worker(struct worker *worker)
wake_up_process(worker->task);
}
-static void idle_worker_timeout(unsigned long __pool)
+static void idle_worker_timeout(struct timer_list *t)
{
- struct worker_pool *pool = (void *)__pool;
+ struct worker_pool *pool = from_timer(pool, t, idle_timer);
spin_lock_irq(&pool->lock);
@@ -1880,9 +1880,9 @@ static void send_mayday(struct work_struct *work)
}
}
-static void pool_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(struct timer_list *t)
{
- struct worker_pool *pool = (void *)__pool;
+ struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
spin_lock_irq(&pool->lock);
@@ -1980,24 +1980,17 @@ static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- /*
- * Anyone who successfully grabs manager_arb wins the arbitration
- * and becomes the manager. mutex_trylock() on pool->manager_arb
- * failure while holding pool->lock reliably indicates that someone
- * else is managing the pool and the worker which failed trylock
- * can proceed to executing work items. This means that anyone
- * grabbing manager_arb is responsible for actually performing
- * manager duties. If manager_arb is grabbed and released without
- * actual management, the pool may stall indefinitely.
- */
- if (!mutex_trylock(&pool->manager_arb))
+ if (pool->flags & POOL_MANAGER_ACTIVE)
return false;
+
+ pool->flags |= POOL_MANAGER_ACTIVE;
pool->manager = worker;
maybe_create_worker(pool);
pool->manager = NULL;
- mutex_unlock(&pool->manager_arb);
+ pool->flags &= ~POOL_MANAGER_ACTIVE;
+ wake_up(&wq_manager_wait);
return true;
}
@@ -2091,8 +2084,30 @@ __acquires(&pool->lock)
spin_unlock_irq(&pool->lock);
- lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
+ /*
+ * Strictly speaking we should mark the invariant state without holding
+ * any locks, that is, before these two lock_map_acquire()'s.
+ *
+ * However, that would result in:
+ *
+ * A(W1)
+ * WFC(C)
+ * A(W1)
+ * C(C)
+ *
+ * Which would create W1->C->W1 dependencies, even though there is no
+ * actual deadlock possible. There are two solutions, using a
+ * read-recursive acquire on the work(queue) 'locks', but this will then
+ * hit the lockdep limitation on recursive locks, or simply discard
+ * these locks.
+ *
+ * AFAICT there is no possible deadlock scenario between the
+ * flush_work() and complete() primitives (except for single-threaded
+ * workqueues), so hiding them isn't a problem.
+ */
+ lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
@@ -2247,7 +2262,7 @@ sleep:
* event.
*/
worker_enter_idle(worker);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
@@ -2289,7 +2304,7 @@ static int rescuer_thread(void *__rescuer)
*/
rescuer->task->flags |= PF_WQ_WORKER;
repeat:
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_IDLE);
/*
* By the time the rescuer is requested to stop, the workqueue
@@ -2474,7 +2489,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
*/
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
- init_completion(&barr->done);
+
+ init_completion_map(&barr->done, &target->lockdep_map);
+
barr->task = current;
/*
@@ -2580,16 +2597,13 @@ void flush_workqueue(struct workqueue_struct *wq)
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
- .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
};
int next_color;
if (WARN_ON(!wq_online))
return;
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
-
mutex_lock(&wq->mutex);
/*
@@ -2815,16 +2829,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
spin_unlock_irq(&pool->lock);
/*
- * If @max_active is 1 or rescuer is in use, flushing another work
- * item on the same workqueue may lead to deadlock. Make sure the
- * flusher is not running on the same workqueue by verifying write
- * access.
+ * Force a lock recursion deadlock when using flush_work() inside a
+ * single-threaded or rescuer equipped workqueue.
+ *
+ * For single threaded workqueues the deadlock happens when the work
+ * is after the work issuing the flush_work(). For rescuer equipped
+ * workqueues the deadlock happens when the rescuer stalls, blocking
+ * forward progress.
*/
- if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
lock_map_acquire(&pwq->wq->lockdep_map);
- else
- lock_map_acquire_read(&pwq->wq->lockdep_map);
- lock_map_release(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
+ }
return true;
already_gone:
@@ -2850,9 +2866,6 @@ bool flush_work(struct work_struct *work)
if (WARN_ON(!wq_online))
return false;
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
@@ -3209,13 +3222,10 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
- (unsigned long)pool);
+ timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
- setup_timer(&pool->mayday_timer, pool_mayday_timeout,
- (unsigned long)pool);
+ timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
- mutex_init(&pool->manager_arb);
mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
@@ -3285,13 +3295,15 @@ static void put_unbound_pool(struct worker_pool *pool)
hash_del(&pool->hash_node);
/*
- * Become the manager and destroy all workers. Grabbing
- * manager_arb prevents @pool's workers from blocking on
- * attach_mutex.
+ * Become the manager and destroy all workers. This prevents
+ * @pool's workers from blocking on attach_mutex. We're the last
+ * manager and @pool gets freed with the flag set.
*/
- mutex_lock(&pool->manager_arb);
-
spin_lock_irq(&pool->lock);
+ wait_event_lock_irq(wq_manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+ pool->flags |= POOL_MANAGER_ACTIVE;
+
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3305,8 +3317,6 @@ static void put_unbound_pool(struct worker_pool *pool)
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
- mutex_unlock(&pool->manager_arb);
-
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
@@ -4614,7 +4624,7 @@ static void rebind_workers(struct worker_pool *pool)
* concurrency management. Note that when or whether
* @worker clears REBOUND doesn't affect correctness.
*
- * ACCESS_ONCE() is necessary because @worker->flags may be
+ * WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
* wq_worker_waking_up(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
@@ -4623,7 +4633,7 @@ static void rebind_workers(struct worker_pool *pool)
WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
worker_flags |= WORKER_REBOUND;
worker_flags &= ~WORKER_UNBOUND;
- ACCESS_ONCE(worker->flags) = worker_flags;
+ WRITE_ONCE(worker->flags, worker_flags);
}
spin_unlock_irq(&pool->lock);
@@ -5357,11 +5367,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
*/
#ifdef CONFIG_WQ_WATCHDOG
-static void wq_watchdog_timer_fn(unsigned long data);
-
static unsigned long wq_watchdog_thresh = 30;
-static struct timer_list wq_watchdog_timer =
- TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
@@ -5375,7 +5382,7 @@ static void wq_watchdog_reset_touched(void)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}
-static void wq_watchdog_timer_fn(unsigned long data)
+static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
@@ -5477,6 +5484,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
static void wq_watchdog_init(void)
{
+ timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
wq_watchdog_set_thresh(wq_watchdog_thresh);
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 8635417c587b..d390d1be3748 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* kernel/workqueue_internal.h
*
@@ -9,6 +10,7 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
struct worker_pool;
@@ -59,7 +61,7 @@ struct worker {
*/
static inline struct worker *current_wq_worker(void)
{
- if (current->flags & PF_WQ_WORKER)
+ if (in_task() && (current->flags & PF_WQ_WORKER))
return kthread_data(current);
return NULL;
}