aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2021-12-08 23:43:50 -0800
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2021-12-08 23:43:50 -0800
commit5d8dfaa71d87f742c53309b95cb6a8b274119027 (patch)
tree83fa5199868fb98dbe7dcb0791bc462bac77265b /kernel
parentInput: ff-core - correct magnitude setting for rumble compatibility (diff)
parentLinux 5.15 (diff)
Merge tag 'v5.15' into next
Sync up with the mainline to get the latest APIs and DT bindings.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Kconfig.preempt20
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.h15
-rw-r--r--kernel/audit_tree.c14
-rw-r--r--kernel/auditsc.c65
-rw-r--r--kernel/bpf/Kconfig2
-rw-r--r--kernel/bpf/arraymap.c22
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_iter.c37
-rw-r--r--kernel/bpf/bpf_lsm.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c29
-rw-r--r--kernel/bpf/bpf_task_storage.c6
-rw-r--r--kernel/bpf/btf.c160
-rw-r--r--kernel/bpf/cgroup.c198
-rw-r--r--kernel/bpf/core.c152
-rw-r--r--kernel/bpf/cpumap.c132
-rw-r--r--kernel/bpf/devmap.c418
-rw-r--r--kernel/bpf/disasm.c18
-rw-r--r--kernel/bpf/disasm.h2
-rw-r--r--kernel/bpf/hashtab.c228
-rw-r--r--kernel/bpf/helpers.c408
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/bpf/local_storage.c20
-rw-r--r--kernel/bpf/lpm_trie.c6
-rw-r--r--kernel/bpf/map_in_map.c8
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c1
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/bpf/stackmap.c17
-rw-r--r--kernel/bpf/syscall.c470
-rw-r--r--kernel/bpf/task_iter.c11
-rw-r--r--kernel/bpf/tnum.c39
-rw-r--r--kernel/bpf/trampoline.c16
-rw-r--r--kernel/bpf/verifier.c955
-rw-r--r--kernel/cfi.c8
-rw-r--r--kernel/cgroup/cgroup-v1.c26
-rw-r--r--kernel/cgroup/cgroup.c300
-rw-r--r--kernel/cgroup/cpuset.c275
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/cgroup/rstat.c21
-rw-r--r--kernel/compat.c21
-rw-r--r--kernel/cpu.c137
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/crash_core.c54
-rw-r--r--kernel/cred.c54
-rw-r--r--kernel/debug/debug_core.c8
-rw-r--r--kernel/debug/gdbstub.c7
-rw-r--r--kernel/debug/kdb/kdb_bp.c72
-rw-r--r--kernel/debug/kdb/kdb_debugger.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c643
-rw-r--r--kernel/debug/kdb/kdb_private.h20
-rw-r--r--kernel/debug/kdb/kdb_support.c348
-rw-r--r--kernel/delayacct.c71
-rw-r--r--kernel/dma/Kconfig17
-rw-r--r--kernel/dma/coherent.c161
-rw-r--r--kernel/dma/debug.c62
-rw-r--r--kernel/dma/debug.h24
-rw-r--r--kernel/dma/direct.c74
-rw-r--r--kernel/dma/direct.h8
-rw-r--r--kernel/dma/dummy.c2
-rw-r--r--kernel/dma/mapping.c91
-rw-r--r--kernel/dma/ops_helpers.c12
-rw-r--r--kernel/dma/swiotlb.c352
-rw-r--r--kernel/entry/common.c4
-rw-r--r--kernel/events/core.c180
-rw-r--r--kernel/events/hw_breakpoint.c6
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c132
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c725
-rw-r--r--kernel/gcov/Kconfig1
-rwxr-xr-xkernel/gen_kheaders.sh4
-rw-r--r--kernel/hung_task.c3
-rw-r--r--kernel/irq/Kconfig5
-rw-r--r--kernel/irq/affinity.c8
-rw-r--r--kernel/irq/chip.c7
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/generic-chip.c17
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/ipi.c32
-rw-r--r--kernel/irq/irqdesc.c76
-rw-r--r--kernel/irq/irqdomain.c124
-rw-r--r--kernel/irq/manage.c57
-rw-r--r--kernel/irq/matrix.c3
-rw-r--r--kernel/irq/msi.c166
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/settings.h12
-rw-r--r--kernel/irq/timings.c7
-rw-r--r--kernel/jump_label.c25
-rw-r--r--kernel/kallsyms.c104
-rw-r--r--kernel/kcsan/atomic.h23
-rw-r--r--kernel/kcsan/core.c128
-rw-r--r--kernel/kcsan/debugfs.c2
-rw-r--r--kernel/kcsan/kcsan.h39
-rw-r--r--kernel/kcsan/kcsan_test.c32
-rw-r--r--kernel/kcsan/permissive.h94
-rw-r--r--kernel/kcsan/report.c171
-rw-r--r--kernel/kexec.c103
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/kprobes.c37
-rw-r--r--kernel/kthread.c53
-rw-r--r--kernel/livepatch/transition.c4
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c135
-rw-r--r--kernel/locking/lockdep_proc.c26
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/locking/mutex-debug.c5
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c547
-rw-r--r--kernel/locking/mutex.h48
-rw-r--r--kernel/locking/rtmutex.c1172
-rw-r--r--kernel/locking/rtmutex_api.c590
-rw-r--r--kernel/locking/rtmutex_common.h135
-rw-r--r--kernel/locking/rwbase_rt.c288
-rw-r--r--kernel/locking/rwsem.c119
-rw-r--r--kernel/locking/semaphore.c4
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c5
-rw-r--r--kernel/locking/spinlock_rt.c263
-rw-r--r--kernel/locking/ww_mutex.h569
-rw-r--r--kernel/locking/ww_rt_mutex.c76
-rw-r--r--kernel/module.c55
-rw-r--r--kernel/notifier.c19
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/padata.c35
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c18
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/pid_namespace.c5
-rw-r--r--kernel/power/Kconfig12
-rw-r--r--kernel/power/energy_model.c4
-rw-r--r--kernel/power/hibernate.c5
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/index.c195
-rw-r--r--kernel/printk/internal.h31
-rw-r--r--kernel/printk/printk.c481
-rw-r--r--kernel/printk/printk_safe.c364
-rw-r--r--kernel/profile.c21
-rw-r--r--kernel/ptrace.c12
-rw-r--r--kernel/rcu/Kconfig.debug2
-rw-r--r--kernel/rcu/rcu.h14
-rw-r--r--kernel/rcu/rcuscale.c4
-rw-r--r--kernel/rcu/rcutorture.c322
-rw-r--r--kernel/rcu/refscale.c149
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c28
-rw-r--r--kernel/rcu/sync.c4
-rw-r--r--kernel/rcu/tasks.h100
-rw-r--r--kernel/rcu/tiny.c1
-rw-r--r--kernel/rcu/tree.c422
-rw-r--r--kernel/rcu/tree.h14
-rw-r--r--kernel/rcu/tree_nocb.h1496
-rw-r--r--kernel/rcu/tree_plugin.h1543
-rw-r--r--kernel/rcu/tree_stall.h171
-rw-r--r--kernel/rcu/update.c8
-rw-r--r--kernel/reboot.c79
-rw-r--r--kernel/rseq.c14
-rw-r--r--kernel/scftorture.c84
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c1876
-rw-r--r--kernel/sched/core_sched.c229
-rw-r--r--kernel/sched/cpuacct.c12
-rw-r--r--kernel/sched/cpufreq_schedutil.c17
-rw-r--r--kernel/sched/deadline.c58
-rw-r--r--kernel/sched/debug.c22
-rw-r--r--kernel/sched/fair.c660
-rw-r--r--kernel/sched/idle.c17
-rw-r--r--kernel/sched/isolation.c4
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c42
-rw-r--r--kernel/sched/rt.c48
-rw-r--r--kernel/sched/sched.h485
-rw-r--r--kernel/sched/stats.h68
-rw-r--r--kernel/sched/stop_task.c14
-rw-r--r--kernel/sched/topology.c278
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/seccomp.c95
-rw-r--r--kernel/signal.c149
-rw-r--r--kernel/smp.c14
-rw-r--r--kernel/smpboot.c11
-rw-r--r--kernel/softirq.c7
-rw-r--r--kernel/static_call.c13
-rw-r--r--kernel/sys.c62
-rw-r--r--kernel/sys_ni.c11
-rw-r--r--kernel/sysctl-test.c24
-rw-r--r--kernel/sysctl.c76
-rw-r--r--kernel/time/Kconfig20
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c23
-rw-r--r--kernel/time/clocksource-wdtest.c201
-rw-r--r--kernel/time/clocksource.c227
-rw-r--r--kernel/time/hrtimer.c340
-rw-r--r--kernel/time/jiffies.c36
-rw-r--r--kernel/time/namespace.c4
-rw-r--r--kernel/time/posix-cpu-timers.c107
-rw-r--r--kernel/time/posix-timers.c6
-rw-r--r--kernel/time/tick-broadcast.c143
-rw-r--r--kernel/time/tick-common.c9
-rw-r--r--kernel/time/tick-internal.h37
-rw-r--r--kernel/time/tick-sched.c129
-rw-r--r--kernel/time/time_test.c99
-rw-r--r--kernel/time/timeconv.c116
-rw-r--r--kernel/time/timekeeping.c36
-rw-r--r--kernel/time/timer.c30
-rw-r--r--kernel/time/timer_list.c10
-rw-r--r--kernel/torture.c6
-rw-r--r--kernel/trace/Kconfig73
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c8
-rw-r--r--kernel/trace/bpf_trace.c130
-rw-r--r--kernel/trace/ftrace.c34
-rw-r--r--kernel/trace/ring_buffer.c38
-rw-r--r--kernel/trace/trace.c341
-rw-r--r--kernel/trace/trace.h98
-rw-r--r--kernel/trace/trace_boot.c334
-rw-r--r--kernel/trace/trace_dynevent.c38
-rw-r--r--kernel/trace/trace_dynevent.h4
-rw-r--r--kernel/trace/trace_entries.h41
-rw-r--r--kernel/trace/trace_eprobe.c959
-rw-r--r--kernel/trace/trace_event_perf.c6
-rw-r--r--kernel/trace/trace_events.c23
-rw-r--r--kernel/trace/trace_events_hist.c223
-rw-r--r--kernel/trace/trace_events_synth.c29
-rw-r--r--kernel/trace/trace_events_trigger.c23
-rw-r--r--kernel/trace/trace_hwlat.c542
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c49
-rw-r--r--kernel/trace/trace_osnoise.c2113
-rw-r--r--kernel/trace/trace_output.c130
-rw-r--r--kernel/trace/trace_probe.c109
-rw-r--r--kernel/trace/trace_probe.h16
-rw-r--r--kernel/trace/trace_probe_tmpl.h6
-rw-r--r--kernel/trace/trace_sched_wakeup.c24
-rw-r--r--kernel/trace/trace_synth.h4
-rw-r--r--kernel/trace/trace_uprobe.c40
-rw-r--r--kernel/tracepoint.c192
-rw-r--r--kernel/ucount.c188
-rw-r--r--kernel/user.c28
-rw-r--r--kernel/user_namespace.c11
-rw-r--r--kernel/usermode_driver.c2
-rw-r--r--kernel/watchdog.c12
-rw-r--r--kernel/workqueue.c222
-rw-r--r--kernel/workqueue_internal.h3
253 files changed, 21909 insertions, 8698 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 416017301660..5876e30c5740 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -99,3 +99,23 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for
both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ depends on SCHED_SMT
+ help
+ This option permits Core Scheduling, a means of coordinated task
+ selection across SMT siblings. When enabled -- see
+ prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings
+ will execute a task from the same 'core group', forcing idle when no
+ matching task is found.
+
+ Use of this feature includes:
+ - mitigation of some (not all) SMT side channels;
+ - limiting SMT interference to improve determinism and/or performance.
+
+ SCHED_CORE is default disabled. When it is enabled and unused,
+ which is the likely usage by Linux distributions, there should
+ be no measurable impact on performance.
+
+
diff --git a/kernel/acct.c b/kernel/acct.c
index a64102be2bb0..23a7ab8e6cbc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
diff --git a/kernel/audit.h b/kernel/audit.h
index 1522e100fd17..d6a2c899a8db 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,11 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* audit -- definition of audit_context structure and supporting types
+/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
+#ifndef _KERNEL_AUDIT_H_
+#define _KERNEL_AUDIT_H_
+
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
@@ -21,16 +24,16 @@
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
- AUDIT_DISABLED, /* Do not create per-task audit_context.
+ AUDIT_STATE_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ AUDIT_STATE_BUILD, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
- AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ AUDIT_STATE_RECORD /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
@@ -322,7 +325,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
@@ -331,3 +334,5 @@ extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
+
+#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6c91902f4f45..2cd7b5694422 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
- put_tree(victim);
}
/*
@@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
+ put_tree(victim);
}
/* trim the uncommitted chunks from tree */
@@ -689,8 +689,7 @@ void audit_trim_trees(void)
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
@@ -899,8 +898,7 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
@@ -925,8 +923,7 @@ int audit_tag_tree(char *old, char *new)
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
- list_del(&tree->list);
- list_add(&tree->list, &tree_list);
+ list_move(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
@@ -937,8 +934,7 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
- list_del(&tree->list);
- list_add(&tree->list, &barrier);
+ list_move(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 175ef6f3ea4e..b1cb1dbf7417 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -160,6 +160,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
+
if (unlikely(!ctx))
return 0;
n = ctx->major;
@@ -231,7 +232,7 @@ static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
}
@@ -239,6 +240,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
int left = ctx->tree_count;
+
if (likely(left)) {
p->c[--left] = chunk;
ctx->tree_count = left;
@@ -259,6 +261,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
static int grow_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p = ctx->trees;
+
ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
if (!ctx->trees) {
ctx->trees = p;
@@ -277,6 +280,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
{
struct audit_tree_refs *q;
int n;
+
if (!p) {
/* we started with empty chain */
p = ctx->first_trees;
@@ -303,6 +307,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
static void free_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p, *q;
+
for (p = ctx->first_trees; p; p = q) {
q = p->next;
kfree(p);
@@ -313,6 +318,7 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
struct audit_tree_refs *p;
int n;
+
if (!tree)
return 0;
/* full ones */
@@ -337,13 +343,13 @@ static int audit_compare_uid(kuid_t uid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_uid_comparator(uid, f->op, n->uid);
@@ -361,13 +367,13 @@ static int audit_compare_gid(kgid_t gid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_gid_comparator(gid, f->op, name->gid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_gid_comparator(gid, f->op, n->gid);
@@ -651,7 +657,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
@@ -751,10 +757,10 @@ static int audit_filter_rules(struct task_struct *tsk,
}
switch (rule->action) {
case AUDIT_NEVER:
- *state = AUDIT_DISABLED;
+ *state = AUDIT_STATE_DISABLED;
break;
case AUDIT_ALWAYS:
- *state = AUDIT_RECORD_CONTEXT;
+ *state = AUDIT_STATE_RECORD;
break;
}
return 1;
@@ -773,14 +779,14 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
&state, true)) {
- if (state == AUDIT_RECORD_CONTEXT)
+ if (state == AUDIT_STATE_RECORD)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return AUDIT_STATE_BUILD;
}
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
@@ -802,7 +808,7 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
/* At syscall exit time, this filter is called if the audit_state is
* not low enough that auditing cannot take place, but is also not
* high enough that we already know we have to write an audit record
- * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ * (i.e., the state is AUDIT_STATE_BUILD).
*/
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
@@ -923,7 +929,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
if (!context)
return NULL;
context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
context->fds[0] = -1;
@@ -950,7 +956,7 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED) {
+ if (state == AUDIT_STATE_DISABLED) {
clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
@@ -1225,6 +1231,7 @@ static void show_special(struct audit_context *context, int *call_panic)
switch (context->type) {
case AUDIT_SOCKETCALL: {
int nargs = context->socketcall.nargs;
+
audit_log_format(ab, "nargs=%d", nargs);
for (i = 0; i < nargs; i++)
audit_log_format(ab, " a%d=%lx", i,
@@ -1240,6 +1247,7 @@ static void show_special(struct audit_context *context, int *call_panic)
if (osid) {
char *ctx = NULL;
u32 len;
+
if (security_secid_to_secctx(osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", osid);
*call_panic = 1;
@@ -1289,6 +1297,7 @@ static void show_special(struct audit_context *context, int *call_panic)
break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+
audit_log_format(ab,
"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
"mq_curmsgs=%ld ",
@@ -1325,6 +1334,7 @@ static void show_special(struct audit_context *context, int *call_panic)
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
char *end = proctitle + len - 1;
+
while (end > proctitle && !isprint(*end))
end--;
@@ -1513,6 +1523,7 @@ static void audit_log_exit(void)
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+
audit_log_format(ab, "fver=%x", axs->fcap_ver);
audit_log_cap(ab, "fp", &axs->fcap.permitted);
audit_log_cap(ab, "fi", &axs->fcap.inheritable);
@@ -1628,7 +1639,7 @@ void __audit_free(struct task_struct *tsk)
audit_filter_syscall(tsk, context);
audit_filter_inodes(tsk, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
+ if (context->current_state == AUDIT_STATE_RECORD)
audit_log_exit();
}
@@ -1647,7 +1658,7 @@ void __audit_free(struct task_struct *tsk)
* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
- * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
* be written).
@@ -1664,11 +1675,11 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
BUG_ON(context->in_syscall || context->name_count);
state = context->state;
- if (state == AUDIT_DISABLED)
+ if (state == AUDIT_STATE_DISABLED)
return;
context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ if (!context->dummy && state == AUDIT_STATE_BUILD) {
context->prio = 0;
if (auditd_test_task(current))
return;
@@ -1693,7 +1704,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
* @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
- * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * auditable (either because of the AUDIT_STATE_RECORD state from
* filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
@@ -1735,12 +1746,12 @@ void __audit_syscall_exit(int success, long return_code)
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
+ if (context->current_state == AUDIT_STATE_RECORD)
audit_log_exit();
}
context->in_syscall = 0;
- context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ context->prio = context->state == AUDIT_STATE_RECORD ? ~0ULL : 0;
audit_free_module(context);
audit_free_names(context);
@@ -1753,7 +1764,7 @@ void __audit_syscall_exit(int success, long return_code)
context->sockaddr_len = 0;
context->type = 0;
context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
+ if (context->state != AUDIT_STATE_RECORD) {
kfree(context->filterkey);
context->filterkey = NULL;
}
@@ -1765,6 +1776,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
+
if (likely(!inode->i_fsnotify_marks))
return;
context = audit_context();
@@ -1806,8 +1818,10 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d_backing_inode(d);
+
if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
+
chunk = audit_tree_lookup(inode);
if (chunk) {
if (unlikely(!put_tree_ref(context, chunk))) {
@@ -2203,7 +2217,7 @@ int auditsc_get_stamp(struct audit_context *ctx,
*serial = ctx->serial;
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
return 1;
}
@@ -2285,6 +2299,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
struct audit_context *context = audit_context();
+
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
@@ -2298,6 +2313,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
struct audit_context *context = audit_context();
+
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
@@ -2362,6 +2378,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
void __audit_fd_pair(int fd1, int fd2)
{
struct audit_context *context = audit_context();
+
context->fds[0] = fd1;
context->fds[1] = fd2;
}
@@ -2379,6 +2396,7 @@ int __audit_sockaddr(int len, void *a)
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+
if (!p)
return -ENOMEM;
context->sockaddr = p;
@@ -2510,6 +2528,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = audit_context();
+
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
@@ -2521,6 +2540,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
void __audit_mmap_fd(int fd, int flags)
{
struct audit_context *context = audit_context();
+
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
@@ -2686,6 +2706,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
+
if (likely(!ctx || !ctx->in_syscall))
return NULL;
return &ctx->killed_trees;
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bd04f4a44c01..a82d6de86522 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -29,7 +29,7 @@ config BPF_SYSCALL
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
- select NET_SOCK_MSG if INET
+ select NET_SOCK_MSG if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..447def540544 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
+static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+{
+ if (unlikely(map_value_has_timer(&arr->map)))
+ bpf_timer_cancel_and_free(val + arr->map.timer_off);
+}
+
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ check_and_free_timer_in_array(array, val);
}
return 0;
}
@@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (likely(!map_value_has_timer(map)))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_timer_cancel_and_free(array->value + array->elem_size * i +
+ map->timer_off);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -1051,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
+ spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2921ca39a93e..96ceed0e0fb5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Netiher the bpf_prog nor the bpf-map's syscall
+ /* Neither the bpf_prog nor the bpf-map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* local_storage->list by the bpf_prog or by the bpf-map's syscall.
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 931870f9cf56..b2ee45064e06 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
return supported;
}
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
+}
+
static void bpf_iter_link_release(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
@@ -473,15 +495,16 @@ bool bpf_link_is_iter(struct bpf_link *link)
return link->ops == &bpf_iter_link_lops;
}
-int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
{
- union bpf_iter_link_info __user *ulinfo;
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
bool existed = false;
+ bpfptr_t ulinfo;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
@@ -489,18 +512,18 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
memset(&linfo, 0, sizeof(union bpf_iter_link_info));
- ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+ ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
linfo_len = attr->link_create.iter_info_len;
- if (!ulinfo ^ !linfo_len)
+ if (bpfptr_is_null(ulinfo) ^ !linfo_len)
return -EINVAL;
- if (ulinfo) {
+ if (!bpfptr_is_null(ulinfo)) {
err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
linfo_len);
if (err)
return err;
linfo_len = min_t(u32, linfo_len, sizeof(linfo));
- if (copy_from_user(&linfo, ulinfo, linfo_len))
+ if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
return -EFAULT;
}
@@ -663,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
rcu_read_lock();
migrate_disable();
- ret = BPF_PROG_RUN(prog, ctx);
+ ret = bpf_prog_run(prog, ctx);
migrate_enable();
rcu_read_unlock();
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index da471bf01b97..06062370c3b8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -127,7 +127,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
/* The set of hooks which are called without pagefaults disabled and are allowed
- * to "sleep" and thus can be used for sleeable BPF programs.
+ * to "sleep" and thus can be used for sleepable BPF programs.
*/
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 70f6fd4fa305..9abcc33f02cf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -28,6 +28,7 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
+ struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
@@ -367,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
+ u32 flags;
moff = btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
@@ -430,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ flags = st_ops->func_models[i].ret_size > 0 ?
+ BPF_TRAMP_F_RET_FENTRY_RET : 0;
err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ &st_ops->func_models[i],
+ flags, tprogs, NULL);
if (err < 0)
goto reset_unlock;
@@ -622,6 +626,14 @@ bool bpf_struct_ops_get(const void *kdata)
return refcount_inc_not_zero(&kvalue->refcnt);
}
+static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ bpf_map_put(&st_map->map);
+}
+
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
@@ -632,6 +644,17 @@ void bpf_struct_ops_put(const void *kdata)
st_map = container_of(kvalue, struct bpf_struct_ops_map,
kvalue);
- bpf_map_put(&st_map->map);
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its map->refcnt may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * Thus, a rcu grace period is needed here.
+ */
+ call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
}
}
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 3ce75758d394..ebfa8bc90892 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_owner_storage_ptr = task_storage_ptr,
};
-BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct)
-
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+ .arg2_btf_id = &btf_task_struct_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &bpf_task_storage_btf_ids[0],
+ .arg2_btf_id = &btf_task_struct_ids[0],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f982a9f0dbc4..dfe61df4f974 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -51,7 +51,7 @@
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
@@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show,
/*
* We need a new copy to our safe object, either because we haven't
- * yet copied and are intializing safe data, or because the data
+ * yet copied and are initializing safe data, or because the data
* we want falls outside the boundaries of the safe object.
*/
if (!safe) {
@@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
- if (!__btf_type_is_struct(t))
- return -EINVAL;
-
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
+ if (member_type->size != sz)
continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
+ /* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
+ return -EINVAL;
+ }
+ return off;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+ const struct btf_var_secinfo *vsi;
+ u32 i, off = -ENOENT;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ if (!__btf_type_is_struct(var_type))
+ continue;
+ if (var_type->size != sz)
+ continue;
+ if (vsi->size != sz)
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ continue;
+ if (off != -ENOENT)
+ /* only one such field is allowed */
+ return -E2BIG;
+ off = vsi->offset;
+ if (off % align)
return -EINVAL;
}
return off;
}
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, name, sz, align);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, name, sz, align);
+ return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_spin_lock",
+ sizeof(struct bpf_spin_lock),
+ __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_timer",
+ sizeof(struct bpf_timer),
+ __alignof__(struct bpf_timer));
+}
+
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
@@ -3417,7 +3466,7 @@ static struct btf_kind_operations func_proto_ops = {
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
- * It should be a funciton pointer instead.
+ * It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
@@ -4257,7 +4306,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return 0;
}
-static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
struct btf_verifier_env *env = NULL;
@@ -4306,7 +4355,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
btf->data = data;
btf->data_size = btf_data_size;
- if (copy_from_user(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
err = -EFAULT;
goto errout;
}
@@ -4776,6 +4825,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
info->reg_type = ctx_arg_info->reg_type;
info->btf = btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
@@ -5792,12 +5846,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
{
struct btf *btf;
int ret;
- btf = btf_parse(u64_to_user_ptr(attr->btf),
+ btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
attr->btf_size, attr->btf_log_level,
u64_to_user_ptr(attr->btf_log_buf),
attr->btf_log_size);
@@ -6097,3 +6151,67 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+ struct btf *btf;
+ long ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret < 0) {
+ struct btf *mod_btf;
+ int id;
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(mod_btf, name, kind);
+ if (ret > 0) {
+ int btf_obj_fd;
+
+ btf_obj_fd = __btf_new_fd(mod_btf);
+ if (btf_obj_fd < 0) {
+ btf_put(mod_btf);
+ return btf_obj_fd;
+ }
+ return ret | (((u64)btf_obj_fd) << 32);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ }
+ return ret;
+}
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+ .func = bpf_btf_find_by_name_kind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b567ca46555c..03145d45e3d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -19,7 +19,7 @@
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work)
struct list_head *storages = &cgrp->bpf.storages;
struct bpf_cgroup_storage *storage, *stmp;
- unsigned int type;
+ unsigned int atype;
mutex_lock(&cgroup_mutex);
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
- struct list_head *progs = &cgrp->bpf.progs[type];
+ for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+ struct list_head *progs = &cgrp->bpf.progs[atype];
struct bpf_prog_list *pl, *pltmp;
list_for_each_entry_safe(pl, pltmp, progs, node) {
@@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work)
if (pl->link)
bpf_cgroup_link_auto_detach(pl->link);
kfree(pl);
- static_branch_dec(&cgroup_bpf_enabled_key[type]);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
- cgrp->bpf.effective[type],
+ cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
bpf_prog_array_free(old_array);
}
@@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *p;
@@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (!p)
return true;
do {
- u32 flags = p->bpf.flags[type];
+ u32 flags = p->bpf.flags[atype];
u32 cnt;
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[type]);
+ cnt = prog_list_length(&p->bpf.progs[atype]);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
* to programs in this cgroup
*/
static int compute_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array **array)
{
struct bpf_prog_array_item *item;
@@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* count number of effective programs by walking parents */
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[type]);
+ if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[atype]);
p = cgroup_parent(p);
} while (p);
@@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[type], node) {
+ list_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
@@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
static void activate_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array *old_array)
{
- old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
@@ -328,7 +328,7 @@ cleanup:
}
static int update_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
int err;
@@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt))
continue;
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
if (err)
goto cleanup;
}
@@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp,
continue;
}
- activate_effective_progs(desc, type, desc->bpf.inactive);
+ activate_effective_progs(desc, atype, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
- struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
+ struct list_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type))
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
bpf_cgroup_storages_assign(pl->storage, storage);
- cgrp->bpf.flags[type] = saved_flags;
+ cgrp->bpf.flags[atype] = saved_flags;
- err = update_effective_progs(cgrp, type);
+ err = update_effective_progs(cgrp, atype);
if (err)
goto cleanup;
if (old_prog)
bpf_prog_put(old_prog);
else
- static_branch_inc(&cgroup_bpf_enabled_key[type]);
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
@@ -520,7 +527,7 @@ cleanup:
* all descendant cgroups. This function is guaranteed to succeed.
*/
static void replace_effective_prog(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_cgroup_link *link)
{
struct bpf_prog_array_item *item;
@@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp,
/* find position of link in effective progs array */
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
- if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- head = &cg->bpf.progs[type];
+ head = &cg->bpf.progs[atype];
list_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
@@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
found:
BUG_ON(!cg);
progs = rcu_dereference_protected(
- desc->bpf.effective[type],
+ desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
item = &progs->items[pos];
WRITE_ONCE(item->prog, link->link.prog);
@@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
struct bpf_cgroup_link *link,
struct bpf_prog *new_prog)
{
- struct list_head *progs = &cgrp->bpf.progs[link->type];
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
+ struct list_head *progs;
bool found = false;
+ atype = to_cgroup_bpf_attach_type(link->type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
if (link->link.prog->type != new_prog->type)
return -EINVAL;
@@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
return -ENOENT;
old_prog = xchg(&link->link.prog, new_prog);
- replace_effective_prog(cgrp, link->type, link);
+ replace_effective_prog(cgrp, atype, link);
bpf_prog_put(old_prog);
return 0;
}
@@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
- struct bpf_prog_list *pl;
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
+ struct bpf_prog_list *pl;
+ struct list_head *progs;
+ u32 flags;
int err;
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
+
if (prog && link)
/* only one of prog or link can be specified */
return -EINVAL;
@@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, type);
+ err = update_effective_progs(cgrp, atype);
if (err)
goto cleanup;
@@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[type] = 0;
+ cgrp->bpf.flags[atype] = 0;
if (old_prog)
bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key[type]);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
cleanup:
@@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
+ struct list_head *progs;
struct bpf_prog *prog;
int cnt, ret = 0, i;
+ u32 flags;
- effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
+
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
@@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto out_put_cgroup;
}
- err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
- BPF_F_ALLOW_MULTI);
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+ link->type, BPF_F_ALLOW_MULTI);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
@@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
unsigned int offset = skb->data - skb_network_header(skb);
struct sock *save_sk;
@@ -1008,12 +1038,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- if (type == BPF_CGROUP_INET_EGRESS) {
+ if (atype == CGROUP_INET_EGRESS) {
ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
+ __bpf_prog_run_save_cb);
ret = (ret == 1 ? 0 : -EPERM);
}
bpf_restore_data_end(skb, saved_data_end);
@@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sk(struct sock *sk,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
void *t_ctx,
u32 *flags)
{
@@ -1090,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN, flags);
+ ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, flags);
return ret == 1 ? 0 : -EPERM;
}
@@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
*/
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
struct bpf_sock_ops_kern *sock_ops,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
- BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+ bpf_prog_run);
return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
- short access, enum bpf_attach_type type)
+ short access, enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp;
struct bpf_cgroup_dev_ctx ctx = {
@@ -1135,12 +1165,12 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow = 1;
+ int allow;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN);
+ allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run);
rcu_read_unlock();
return !allow;
@@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct bpf_sysctl_kern ctx = {
.head = head,
@@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum bpf_attach_type attach_type)
+ enum cgroup_bpf_attach_type attach_type)
{
struct bpf_prog_array *prog_array;
bool empty;
@@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
return 0;
/* Allocate a bit more than the initial user buffer for
@@ -1385,8 +1415,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
+ &ctx, bpf_prog_run);
release_sock(sk);
if (!ret) {
@@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
return retval;
ctx.optlen = max_optlen;
@@ -1495,8 +1525,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+ &ctx, bpf_prog_run);
release_sock(sk);
if (!ret) {
@@ -1556,8 +1586,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
* be called if that data shouldn't be "exported".
*/
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+ &ctx, bpf_prog_run);
if (!ret)
return -EPERM;
@@ -1846,15 +1876,41 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+ const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+ .func = bpf_get_netns_cookie_sockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
#ifdef CONFIG_NET
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sockopt_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e31ee9f7512..6e3ae90ad107 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,8 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -522,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
@@ -815,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
PAGE_SIZE), LONG_MAX);
return 0;
}
@@ -825,7 +829,7 @@ int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
@@ -1360,11 +1364,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
@@ -1377,6 +1383,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1392,29 +1399,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
ALU_NEG:
DST = (u32) -DST;
@@ -1439,13 +1471,13 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) (((s32) DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
@@ -1596,7 +1628,21 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+#ifdef CONFIG_X86
+ barrier_nospec();
+#endif
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1777,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ spin_lock(&array->aux->owner.lock);
+
+ if (!array->aux->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ array->aux->owner.type = fp->type;
+ array->aux->owner.jited = fp->jited;
+ ret = true;
+ } else {
+ ret = array->aux->owner.type == fp->type &&
+ array->aux->owner.jited == fp->jited;
}
-
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ spin_unlock(&array->aux->owner.lock);
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
@@ -1835,7 +1887,10 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
@@ -2072,13 +2127,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
+ u64 bpf_cookie,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog_array_item *existing;
+ struct bpf_prog_array_item *existing, *new;
struct bpf_prog_array *array;
bool found_exclude = false;
- int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
* the new array.
@@ -2115,20 +2170,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
if (!array)
return -ENOMEM;
+ new = array->items;
/* Fill in the new prog array */
if (carry_prog_cnt) {
existing = old_array->items;
- for (; existing->prog; existing++)
- if (existing->prog != exclude_prog &&
- existing->prog != &dummy_bpf_prog.prog) {
- array->items[new_prog_idx++].prog =
- existing->prog;
- }
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog ||
+ existing->prog == &dummy_bpf_prog.prog)
+ continue;
+
+ new->prog = existing->prog;
+ new->bpf_cookie = existing->bpf_cookie;
+ new++;
+ }
}
- if (include_prog)
- array->items[new_prog_idx++].prog = include_prog;
- array->items[new_prog_idx].prog = NULL;
+ if (include_prog) {
+ new->prog = include_prog;
+ new->bpf_cookie = bpf_cookie;
+ new++;
+ }
+ new->prog = NULL;
*new_array = array;
return 0;
}
@@ -2211,8 +2273,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++)
+ for (i = 0; i < aux->func_cnt; i++) {
+ /* We can just unlink the subprog poke descriptor table as
+ * it was originally linked to the main program and is also
+ * released along with it.
+ */
+ aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
+ }
if (aux->func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..585b2b77ccc4 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -16,6 +16,7 @@
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
+#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@@ -74,7 +75,7 @@ struct bpf_cpu_map_entry {
struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
- struct bpf_cpu_map_entry **cpu_map;
+ struct bpf_cpu_map_entry __rcu **cpu_map;
};
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
@@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ struct list_head *listp,
+ struct xdp_cpumap_stats *stats)
+{
+ struct sk_buff *skb, *tmp;
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ list_for_each_entry_safe(skb, tmp, listp, list) {
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_REDIRECT:
+ skb_list_del_init(skb);
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ return;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ stats->drop++;
+ return;
+ }
+ }
+}
+
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
@@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- if (!rcpu->prog)
- return n;
-
- rcu_read_lock_bh();
-
xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
@@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
+ xdp_clear_return_frame_no_direct();
+
+ return nframes;
+}
+
+#define CPUMAP_BATCH 8
+
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ int xdp_n, struct xdp_cpumap_stats *stats,
+ struct list_head *list)
+{
+ int nframes;
+
+ if (!rcpu->prog)
+ return xdp_n;
+
+ rcu_read_lock_bh();
+
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+
if (stats->redirect)
- xdp_do_flush_map();
+ xdp_do_flush();
- xdp_clear_return_frame_no_direct();
+ if (unlikely(!list_empty(list)))
+ cpu_map_bpf_prog_run_skb(rcpu, list, stats);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
}
-#define CPUMAP_BATCH 8
static int cpu_map_kthread_run(void *data)
{
@@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data)
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- int i, n, m, nframes;
LIST_HEAD(list);
/* Release CPU reschedule checks */
@@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0; i < n; i++) {
+ for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
- struct page *page = virt_to_page(f);
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ list_add_tail(&skb->list, &list);
+ continue;
+ }
+
+ frames[xdp_n++] = f;
+ page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data)
}
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
@@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-bool cpu_map_prog_allowed(struct bpf_map *map)
-{
- return map->map_type == BPF_MAP_TYPE_CPUMAP &&
- map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
-}
-
static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
{
struct bpf_prog *prog;
@@ -469,7 +530,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
{
struct bpf_cpu_map_entry *old_rcpu;
- old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
@@ -551,7 +612,7 @@ static void cpu_map_free(struct bpf_map *map)
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
- rcpu = READ_ONCE(cmap->cpu_map[i]);
+ rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
if (!rcpu)
continue;
@@ -562,6 +623,10 @@ static void cpu_map_free(struct bpf_map *map)
kfree(cmap);
}
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
@@ -570,7 +635,8 @@ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- rcpu = READ_ONCE(cmap->cpu_map[key]);
+ rcpu = rcu_dereference_check(cmap->cpu_map[key],
+ rcu_read_lock_bh_held());
return rcpu;
}
@@ -601,7 +667,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __cpu_map_lookup_elem);
}
static int cpu_map_btf_id;
@@ -695,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
+
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..f02d04540c0c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
@@ -72,7 +73,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
- struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
+ struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -92,7 +93,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries,
int i;
struct hlist_head *hash;
- hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
+ hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -143,7 +144,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
spin_lock_init(&dtab->index_lock);
} else {
- dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
@@ -197,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -224,7 +226,7 @@ static void dev_map_free(struct bpf_map *map)
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
- dev = dtab->netdev_map[i];
+ dev = rcu_dereference_raw(dtab->netdev_map[i]);
if (!dev)
continue;
@@ -257,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -316,32 +322,69 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-bool dev_map_can_have_prog(struct bpf_map *map)
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
{
- if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
- map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
- return true;
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
- return false;
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
}
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
+ unsigned int cnt = bq->count;
int sent = 0, err = 0;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
+ if (unlikely(!cnt))
return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
/* If ndo_xdp_xmit fails with an errno, no frames have
* been xmit'ed.
@@ -353,37 +396,34 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
/* If not all frames have been transmitted, it is our
* responsibility to free them
*/
- for (i = sent; unlikely(i < bq->count); i++)
+ for (i = sent; unlikely(i < to_send); i++)
xdp_return_frame_rx_napi(bq->q[i]);
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
- bq->dev_rx = NULL;
+out:
bq->count = 0;
- __list_del_clearprev(&bq->flush_node);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}
-/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
- * from the driver before returning from its napi->poll() routine. The poll()
- * routine is called either from busy_poll context or net_rx_action signaled
- * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the flush list
- * is empty before completing to ensure all flush operations have completed.
- * When drivers update the bpf program they may need to ensure any flush ops
- * are also complete. Using synchronize_rcu or call_rcu will suffice for this
- * because both wait for napi context to exit.
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
*/
void __dev_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
}
-/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
*/
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -393,15 +433,17 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- obj = READ_ONCE(dtab->netdev_map[key]);
+ obj = rcu_dereference_check(dtab->netdev_map[key],
+ rcu_read_lock_bh_held());
return obj;
}
-/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +454,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
+ bq->xdp_prog = xdp_prog;
+ list_add(&bq->flush_node, flush_list);
+ }
bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
- list_add(&bq->flush_node, flush_list);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;
@@ -439,42 +485,45 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
- struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
u32 act;
- xdp_set_data_meta_invalid(xdp);
- xdp->txq = &txq;
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
- act = bpf_prog_run_xdp(xdp_prog, xdp);
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
switch (act) {
case XDP_PASS:
- return xdp;
- case XDP_DROP:
+ __skb_push(skb, skb->mac_len);
break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
break;
}
- xdp_return_buff(xdp);
- return NULL;
+ return act;
}
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return __xdp_enqueue(dev, xdp, dev_rx, NULL);
}
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
@@ -482,12 +531,138 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
{
struct net_device *dev = dst->dev;
- if (dst->xdp_prog) {
- xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
- if (!xdp)
- return 0;
+ return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+}
+
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
+{
+ if (!obj ||
+ !obj->dev->netdev_ops->ndo_xdp_xmit)
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ return false;
+
+ return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
+{
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
+}
+
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
}
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ struct xdp_frame *xdpf;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
+
+ xdpf = xdp_convert_buff_to_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!is_valid_dst(dst, xdp))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdp))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -498,12 +673,116 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ struct hlist_node *next;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -538,14 +817,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- /* Use call_rcu() here to ensure any rcu critical sections have
- * completed as well as any flush operations because call_rcu
- * will wait for preempt-disable region to complete, NAPI in this
- * context. And additionally, the driver tear down ensures all
- * soft irqs are complete before removing the net device in the
- * case of dev_put equals zero.
- */
- old_dev = xchg(&dtab->netdev_map[k], NULL);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
return 0;
@@ -654,7 +926,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
* Remembering the driver side flush operation will happen before the
* net device is removed.
*/
- old_dev = xchg(&dtab->netdev_map[i], dev);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
@@ -730,12 +1002,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
}
static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
}
static int dev_map_btf_id;
@@ -830,10 +1106,10 @@ static int dev_map_notification(struct notifier_block *notifier,
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
- dev = READ_ONCE(dtab->netdev_map[i]);
+ dev = rcu_dereference(dtab->netdev_map[i]);
if (!dev || netdev != dev->dev)
continue;
- odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
if (dev == odev)
call_rcu(&dev->rcu,
__dev_map_entry_free);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..7b4afb7d96db 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d7ebb12ffffc..32471ba02708 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,12 +46,12 @@
* events, kprobes and tracing to be invoked before the prior invocation
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
*
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
* operations like memory allocations (even with GFP_ATOMIC) from atomic
* contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
* sections. To ensure the deterministic behaviour these locks are regular
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
* true atomic contexts on an RT kernel are the low level hardware
@@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+ cond_resched();
+ }
+}
+
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ u32 key_size = htab->map.key_size;
+
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, htab->map.key_size);
+ memcpy(l->key, key, key_size);
+ check_and_init_map_value(&htab->map,
+ l->key + round_up(key_size, 8));
return l;
}
@@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@@ -596,7 +626,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -694,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (unlikely(map_value_has_timer(&htab->map)))
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -718,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_timer(htab, l);
break;
}
@@ -789,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ check_and_free_timer(htab, l);
kfree(l);
}
@@ -816,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ check_and_free_timer(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -919,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
+ check_and_init_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -989,7 +1031,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1060,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_timer(htab, l_old);
}
ret = 0;
err:
@@ -1067,6 +1112,12 @@ err:
return ret;
}
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_timer(htab, elem);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
@@ -1082,7 +1133,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1099,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+ copy_map_value(&htab->map,
+ l_new->key + round_up(map->key_size, 8), value);
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
@@ -1125,9 +1178,9 @@ err:
htab_unlock_bucket(htab, b, hash, flags);
if (ret)
- bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ htab_lru_push_free(htab, l_new);
else if (l_old)
- bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+ htab_lru_push_free(htab, l_old);
return ret;
}
@@ -1148,7 +1201,8 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1202,7 +1256,8 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1276,7 +1331,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1311,7 +1367,8 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -1332,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
htab_unlock_bucket(htab, b, hash, flags);
if (l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1352,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry(l, n, head, hash_node)
+ check_and_free_timer(htab, l);
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers(htab);
+ else
+ htab_free_prealloced_timers(htab);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1401,6 +1487,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, bool is_lru_map,
+ bool is_percpu, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ unsigned long bflags;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret;
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(htab, b, hash, &bflags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l) {
+ ret = -ENOENT;
+ } else {
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
+
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu),
+ roundup_value_size);
+ off += roundup_value_size;
+ }
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
+
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ check_and_init_map_value(map, value);
+ }
+
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (!is_lru_map)
+ free_htab_elem(htab, l);
+ }
+
+ htab_unlock_bucket(htab, b, hash, bflags);
+
+ if (is_lru_map && l)
+ htab_lru_push_free(htab, l);
+
+ return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+ flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+ flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+ flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+ flags);
+}
+
static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
@@ -1464,8 +1644,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
@@ -1544,7 +1724,7 @@ again_nocopy:
true);
else
copy_map_value(map, dst_val, value);
- check_and_init_map_lock(map, dst_val);
+ check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@@ -1571,7 +1751,7 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
}
next_batch:
@@ -1933,7 +2113,9 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
@@ -1953,7 +2135,9 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2261,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2281,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a2f1f15ce432..9aabf84afd4b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -29,7 +29,7 @@
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -45,7 +45,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -62,7 +62,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
return map->ops->map_delete_elem(map, key);
}
@@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
return 0;
}
@@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
};
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
return 0;
}
@@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
else
lock = dst + map->spin_lock_off;
preempt_disable();
- ____bpf_spin_lock(lock);
+ __bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
- ____bpf_spin_unlock(lock);
+ __bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@@ -353,9 +363,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = {
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
+ u64 cgrp_id;
- return cgroup_id(cgrp);
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
@@ -366,13 +382,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
struct cgroup *ancestor;
+ u64 cgrp_id;
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
- if (!ancestor)
- return 0;
- return cgroup_id(ancestor);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
@@ -383,8 +403,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -393,17 +411,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage = NULL;
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
void *ptr;
- int i;
- for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
- if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
- continue;
-
- storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
- break;
- }
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
@@ -904,6 +918,20 @@ fmt_str:
num_spec++;
continue;
+ } else if (fmt[i] == 'c') {
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ *tmp_buf = raw_args[num_spec];
+ tmp_buf++;
+ num_spec++;
+
+ continue;
}
sizeof_cur_arg = sizeof(int);
@@ -989,11 +1017,327 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct hrtimer timer;
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space resereved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+ */
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ void *callback_fn;
+ void *key;
+ u32 idx;
+
+ callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
+ (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->value = (void *)timer - map->timer_off;
+ t->map = map;
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ timer->timer = t;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&t->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = t->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ t->prog = prog;
+ }
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+ struct bpf_prog *prog = t->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ }
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (this_cpu_read(hrtimer_running) == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+ drop_prog_refcnt(t);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_timer_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+ if (!READ_ONCE(timer->timer))
+ return;
+
+ __bpf_spin_lock_irqsave(&timer->lock);
+ /* re-read it under lock */
+ t = timer->timer;
+ if (!t)
+ goto out;
+ drop_prog_refcnt(t);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ timer->timer = NULL;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+ * right after for both preallocated and non-preallocated maps.
+ * The timer->timer = NULL was already done and no code path can
+ * see address 't' anymore.
+ *
+ * Check that bpf_map_delete/update_elem() wasn't called from timer
+ * callback_fn. In such case don't call hrtimer_cancel() (since it will
+ * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+ * return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+ kfree(t);
+}
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
@@ -1055,6 +1399,14 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
default:
break;
}
@@ -1067,20 +1419,24 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index b4ebd60a6c16..80da1db47c68 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -543,7 +543,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
- ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw);
+ ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else if (type == BPF_TYPE_LINK)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..035e9e3a7132 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,6 +1,7 @@
//SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
@@ -11,9 +12,6 @@
#ifdef CONFIG_CGROUP_BPF
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
@@ -173,7 +171,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
- check_and_init_map_lock(map, new->data);
+ check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -286,9 +284,17 @@ enoent:
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
+ __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
+ /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu
+ * is the same as other local storages.
+ */
+ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ max_value_size = min_t(__u32, max_value_size,
+ PCPU_MIN_UNIT_SIZE);
+
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
attr->key_size != sizeof(__u64))
return ERR_PTR(-EINVAL);
@@ -296,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->value_size == 0)
return ERR_PTR(-EINVAL);
- if (attr->value_size > PAGE_SIZE)
+ if (attr->value_size > max_value_size)
return ERR_PTR(-E2BIG);
if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -409,7 +415,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ enum bpf_cgroup_storage_type stype;
struct bpf_cgroup_storage *storage;
int cpu;
@@ -509,7 +515,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
map->numa_node);
if (!storage->buf)
goto enomem;
- check_and_init_map_lock(map, storage->buf->data);
+ check_and_init_map_value(map, storage->buf->data);
} else {
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b7b8a6f34ee..423549d2c52e 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -232,7 +232,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
/* Start walking the trie from the root node ... */
- for (node = rcu_dereference(trie->root); node;) {
+ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+ node;) {
unsigned int next_bit;
size_t matchlen;
@@ -264,7 +265,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* traverse down.
*/
next_bit = extract_bit(key->data, node->prefixlen);
- node = rcu_dereference(node->child[next_bit]);
+ node = rcu_dereference_check(node->child[next_bit],
+ rcu_read_lock_bh_held());
}
if (!found)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade..5cd8f5277279 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include "map_in_map.h"
@@ -50,6 +51,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+ inner_map_meta->timer_off = inner_map->timer_off;
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
@@ -65,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -75,6 +82,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
+ meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags;
}
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 52aa7b38e8b8..03af863314ea 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -2,7 +2,6 @@
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4838922f723d..93a55391791a 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* ops->map_*_elem() will not be able to access this
* array now. Hence, this function only races with
- * bpf_sk_reuseport_detach() which was triggerred by
+ * bpf_sk_reuseport_detach() which was triggered by
* close() or disconnect().
*
* This function and bpf_sk_reuseport_detach() are
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 84b3b35fc0d0..9e0c10c6892a 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
+#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -105,6 +106,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
if (rb) {
+ kmemleak_not_leak(pages);
rb->pages = pages;
rb->nr_pages = nr_pages;
return rb;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6fbc2abe9c91..6e75bbee39f0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
@@ -179,7 +180,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* with build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock_non_owner(current->mm)) {
+ !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -204,9 +205,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
if (!work) {
- mmap_read_unlock_non_owner(current->mm);
+ mmap_read_unlock(current->mm);
} else {
work->mm = current->mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
irq_work_queue(&work->irq_work);
}
}
@@ -530,14 +537,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
return res;
}
-BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
-
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &bpf_get_task_stack_btf_ids[0],
+ .arg1_btf_id = &btf_task_struct_ids[0],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ea04b0deb5ce..1cad6979a0d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -73,11 +73,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
-int bpf_check_uarg_tail_zero(void __user *uaddr,
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
size_t expected_size,
size_t actual_size)
{
- unsigned char __user *addr = uaddr + expected_size;
int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
@@ -86,7 +85,12 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
if (actual_size <= expected_size)
return 0;
- res = check_zeroed_user(addr, actual_size - expected_size);
+ if (uaddr.is_kernel)
+ res = memchr_inv(uaddr.kernel + expected_size, 0,
+ actual_size - expected_size) == NULL;
+ else
+ res = check_zeroed_user(uaddr.user + expected_size,
+ actual_size - expected_size);
if (res < 0)
return res;
return res ? 0 : -E2BIG;
@@ -256,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
@@ -539,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
- type = array->aux->type;
- jited = array->aux->jited;
+ spin_lock(&array->aux->owner.lock);
+ type = array->aux->owner.type;
+ jited = array->aux->owner.jited;
+ spin_unlock(&array->aux->owner.lock);
}
seq_printf(m,
@@ -619,7 +625,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
+ map_value_has_timer(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -789,6 +796,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
}
+ map->timer_off = btf_find_timer(btf, value_type);
+ if (map_value_has_timer(map)) {
+ if (map->map_flags & BPF_F_RDONLY_PROG)
+ return -EACCES;
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ return -EOPNOTSUPP;
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -840,6 +857,7 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
+ map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -997,7 +1015,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
if (key_size)
- return memdup_user(ukey, key_size);
+ return vmemdup_user(ukey, key_size);
if (ukey)
return ERR_PTR(-EINVAL);
@@ -1005,6 +1023,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
return NULL;
}
+static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
+{
+ if (key_size)
+ return kvmemdup_bpfptr(ukey, key_size);
+
+ if (!bpfptr_is_null(ukey))
+ return ERR_PTR(-EINVAL);
+
+ return NULL;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
@@ -1049,7 +1078,7 @@ static int map_lookup_elem(union bpf_attr *attr)
value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
@@ -1064,9 +1093,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1075,10 +1104,10 @@ err_put:
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
-static int map_update_elem(union bpf_attr *attr)
+static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
- void __user *uvalue = u64_to_user_ptr(attr->value);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
+ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
@@ -1104,35 +1133,29 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -EFAULT;
- if (copy_from_user(value, uvalue, value_size) != 0)
+ if (copy_from_bpfptr(value, uvalue, value_size) != 0)
goto free_value;
err = bpf_map_update_value(map, f, key, value, attr->flags);
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1184,7 +1207,7 @@ static int map_delete_elem(union bpf_attr *attr)
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
out:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1226,7 +1249,7 @@ static int map_get_next_key(union bpf_attr *attr)
}
err = -ENOMEM;
- next_key = kmalloc(map->key_size, GFP_USER);
+ next_key = kvmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
@@ -1249,9 +1272,9 @@ out:
err = 0;
free_next_key:
- kfree(next_key);
+ kvfree(next_key);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1278,7 +1301,7 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1305,7 +1328,7 @@ int generic_map_delete_batch(struct bpf_map *map,
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(key);
+ kvfree(key);
return err;
}
@@ -1316,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->map_fd;
+ int ufd = attr->batch.map_fd;
void *key, *value;
struct fd f;
int err = 0;
- f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -1336,16 +1358,17 @@ int generic_map_update_batch(struct bpf_map *map,
if (!max_count)
return 0;
- key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value) {
- kfree(key);
+ kvfree(key);
return -ENOMEM;
}
+ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1363,8 +1386,9 @@ int generic_map_update_batch(struct bpf_map *map,
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
- kfree(value);
- kfree(key);
+ kvfree(value);
+ kvfree(key);
+ fdput(f);
return err;
}
@@ -1398,13 +1422,13 @@ int generic_map_lookup_batch(struct bpf_map *map,
if (put_user(0, &uattr->batch.count))
return -EFAULT;
- buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!buf_prevkey)
return -ENOMEM;
- buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
- kfree(buf_prevkey);
+ kvfree(buf_prevkey);
return -ENOMEM;
}
@@ -1464,12 +1488,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = -EFAULT;
free_buf:
- kfree(buf_prevkey);
- kfree(buf);
+ kvfree(buf_prevkey);
+ kvfree(buf);
return err;
}
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
@@ -1485,6 +1509,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1495,24 +1522,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (attr->flags &&
+ (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
+ err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
- } else {
- err = -ENOTSUPP;
+ } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (!bpf_map_is_dev_bound(map)) {
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ }
}
if (err)
@@ -1526,9 +1576,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
fdput(f);
return err;
@@ -1550,7 +1600,8 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+ map_value_has_timer(map)) {
fdput(f);
return -ENOTSUPP;
}
@@ -1658,6 +1709,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
+ unsigned long flags;
+
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@@ -1667,7 +1720,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
return;
if (do_idr_lock)
- spin_lock_bh(&prog_idr_lock);
+ spin_lock_irqsave(&prog_idr_lock, flags);
else
__acquire(&prog_idr_lock);
@@ -1675,7 +1728,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
prog->aux->id = 0;
if (do_idr_lock)
- spin_unlock_bh(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
else
__release(&prog_idr_lock);
}
@@ -1711,14 +1764,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
}
}
+static void bpf_prog_put_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ __bpf_prog_put_noref(prog, true);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
- if (atomic64_dec_and_test(&prog->aux->refcnt)) {
- perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
- bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- __bpf_prog_put_noref(prog, true);
+
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
}
}
@@ -1932,6 +2003,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_SK_REUSEPORT_SELECT;
+ break;
}
}
@@ -2015,6 +2091,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ switch (expected_attach_type) {
+ case BPF_SK_REUSEPORT_SELECT:
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
@@ -2074,9 +2159,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
+#define BPF_PROG_LOAD_LAST_FIELD fd_array
-static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2101,8 +2186,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return -EPERM;
/* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
- sizeof(license) - 1) < 0)
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
@@ -2186,8 +2272,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
- bpf_prog_insn_size(prog)) != 0)
+ if (copy_from_bpfptr(prog->insns,
+ make_bpfptr(attr->insns, uattr.is_kernel),
+ bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
prog->orig_prog = NULL;
@@ -2816,6 +2903,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+ struct bpf_link link;
+ struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+ struct perf_event *event = perf_link->perf_file->private_data;
+
+ perf_event_free_bpf_prog(event);
+ fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+ kfree(perf_link);
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+ .release = bpf_perf_link_release,
+ .dealloc = bpf_perf_link_dealloc,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_perf_link *link;
+ struct perf_event *event;
+ struct file *perf_file;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ perf_file = perf_event_get(attr->link_create.target_fd);
+ if (IS_ERR(perf_file))
+ return PTR_ERR(perf_file);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_file;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ link->perf_file = perf_file;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_file;
+ }
+
+ event = perf_file->private_data;
+ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_file;
+ }
+ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out_put_file:
+ fput(perf_file);
+ return err;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
@@ -3423,7 +3583,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
u32 ulen;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3702,7 +3862,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3745,7 +3905,7 @@ static int bpf_btf_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
if (err)
return err;
@@ -3762,7 +3922,7 @@ static int bpf_link_get_info_by_fd(struct file *file,
u32 info_len = attr->info.info_len;
int err;
- err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -3825,7 +3985,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
-static int bpf_btf_load(const union bpf_attr *attr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -3833,7 +3993,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr);
+ return btf_new_fd(attr, uattr);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4023,13 +4183,14 @@ err_put:
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
{
if (attr->link_create.attach_type != prog->expected_attach_type)
return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, prog);
+ return bpf_iter_link_attach(attr, uattr, prog);
else if (prog->type == BPF_PROG_TYPE_EXT)
return bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
@@ -4038,7 +4199,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
}
#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
-static int link_create(union bpf_attr *attr)
+static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
@@ -4056,15 +4217,26 @@ static int link_create(union bpf_attr *attr)
if (ret)
goto out;
- if (prog->type == BPF_PROG_TYPE_EXT) {
- ret = tracing_bpf_link_attach(attr, prog);
- goto out;
- }
-
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_EXT:
+ ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ptype = prog->type;
+ break;
+ default:
+ ptype = attach_type_to_prog_type(attr->link_create.attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
}
switch (ptype) {
@@ -4078,7 +4250,7 @@ static int link_create(union bpf_attr *attr)
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, prog);
+ ret = tracing_bpf_link_attach(attr, uattr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
@@ -4089,6 +4261,13 @@ static int link_create(union bpf_attr *attr)
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
+#ifdef CONFIG_PERF_EVENTS
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_KPROBE:
+ ret = bpf_perf_link_attach(attr, prog);
+ break;
+#endif
default:
ret = -EINVAL;
}
@@ -4366,7 +4545,7 @@ out_prog_put:
return ret;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
@@ -4381,7 +4560,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
memset(&attr, 0, sizeof(attr));
- if (copy_from_user(&attr, uattr, size) != 0)
+ if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
err = security_bpf(cmd, &attr, size);
@@ -4396,7 +4575,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
- err = map_update_elem(&attr);
+ err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
err = map_delete_elem(&attr);
@@ -4423,21 +4602,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
- err = bpf_prog_query(&attr, uattr);
+ err = bpf_prog_query(&attr, uattr.user);
break;
case BPF_PROG_TEST_RUN:
- err = bpf_prog_test_run(&attr, uattr);
+ err = bpf_prog_test_run(&attr, uattr.user);
break;
case BPF_PROG_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&prog_idr, &prog_idr_lock);
break;
case BPF_MAP_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&map_idr, &map_idr_lock);
break;
case BPF_BTF_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&btf_idr, &btf_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
@@ -4447,38 +4626,38 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_map_get_fd_by_id(&attr);
break;
case BPF_OBJ_GET_INFO_BY_FD:
- err = bpf_obj_get_info_by_fd(&attr, uattr);
+ err = bpf_obj_get_info_by_fd(&attr, uattr.user);
break;
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr);
+ err = bpf_btf_load(&attr, uattr);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
- err = bpf_task_fd_query(&attr, uattr);
+ err = bpf_task_fd_query(&attr, uattr.user);
break;
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
case BPF_MAP_LOOKUP_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
break;
case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr,
+ err = bpf_map_do_batch(&attr, uattr.user,
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
break;
case BPF_MAP_UPDATE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
break;
case BPF_MAP_DELETE_BATCH:
- err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
break;
case BPF_LINK_CREATE:
- err = link_create(&attr);
+ err = link_create(&attr, uattr);
break;
case BPF_LINK_UPDATE:
err = link_update(&attr);
@@ -4487,7 +4666,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_link_get_fd_by_id(&attr);
break;
case BPF_LINK_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
+ err = bpf_obj_get_next_id(&attr, uattr.user,
&link_idr, &link_idr_lock);
break;
case BPF_ENABLE_STATS:
@@ -4509,3 +4688,94 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
return err;
}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+}
+
+static bool syscall_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= U16_MAX)
+ return false;
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+{
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ case BPF_MAP_UPDATE_ELEM:
+ case BPF_MAP_FREEZE:
+ case BPF_PROG_LOAD:
+ case BPF_BTF_LOAD:
+ break;
+ /* case BPF_PROG_TEST_RUN:
+ * is not part of this list to prevent recursive test_run
+ */
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+static const struct bpf_func_proto bpf_sys_bpf_proto = {
+ .func = bpf_sys_bpf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+BPF_CALL_1(bpf_sys_close, u32, fd)
+{
+ /* When bpf program calls this helper there should not be
+ * an fdget() without matching completed fdput().
+ * This helper is allowed in the following callchain only:
+ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
+ */
+ return close_fd(fd);
+}
+
+static const struct bpf_func_proto bpf_sys_close_proto = {
+ .func = bpf_sys_close,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sys_bpf:
+ return &bpf_sys_bpf_proto;
+ case BPF_FUNC_btf_find_by_name_kind:
+ return &bpf_btf_find_by_name_kind_proto;
+ case BPF_FUNC_sys_close:
+ return &bpf_sys_close_proto;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
+ .get_func_proto = syscall_prog_func_proto,
+ .is_valid_access = syscall_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops bpf_syscall_prog_ops = {
+ .test_run = bpf_prog_test_run_syscall,
+};
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b68cb5d6d6eb..b48750bfba5a 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
};
BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, task_struct)
BTF_ID(struct, file)
BTF_ID(struct, vm_area_struct)
@@ -591,19 +590,19 @@ static int __init task_iter_init(void)
{
int ret;
- task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+ task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
- task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
- task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
- task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ceac5281bd31..3d7127f439a1 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* half-multiply add: acc += (unknown * mask * value).
- * An intermediate step in the multiply algorithm.
+/* Generate partial products by multiplying each bit in the multiplier (tnum a)
+ * with the multiplicand (tnum b), and add the partial products after
+ * appropriately bit-shifting them. Instead of directly performing tnum addition
+ * on the generated partial products, equivalenty, decompose each partial
+ * product into two tnums, consisting of the value-sum (acc_v) and the
+ * mask-sum (acc_m) and then perform tnum addition on them. The following paper
+ * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
*/
-static struct tnum hma(struct tnum acc, u64 value, u64 mask)
-{
- while (mask) {
- if (mask & 1)
- acc = tnum_add(acc, TNUM(0, value));
- mask >>= 1;
- value <<= 1;
- }
- return acc;
-}
-
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- struct tnum acc;
- u64 pi;
+ u64 acc_v = a.value * b.value;
+ struct tnum acc_m = TNUM(0, 0);
- pi = a.value * b.value;
- acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
- return hma(acc, b.mask, a.value);
+ while (a.value || a.mask) {
+ /* LSB of tnum a is a certain 1 */
+ if (a.value & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ /* LSB of tnum a is uncertain */
+ else if (a.mask & 1)
+ acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ /* Note: no case for LSB is certain 0 */
+ a = tnum_rshift(a, 1);
+ b = tnum_lshift(b, 1);
+ }
+ return tnum_add(TNUM(acc_v, 0), acc_m);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2d44b5aa0057..fe1e857324e6 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
static struct bpf_tramp_progs *
-bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
const struct bpf_prog_aux *aux;
struct bpf_tramp_progs *tprogs;
@@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
*total += tr->progs_cnt[kind];
progs = tprogs[kind].progs;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
+ hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= aux->prog->call_get_func_ip;
*progs++ = aux->prog;
+ }
}
return tprogs;
}
@@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ bool ip_arg = false;
int err, total;
- tprogs = bpf_trampoline_get_progs(tr, &total);
+ tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
if (IS_ERR(tprogs))
return PTR_ERR(tprogs);
@@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ if (ip_arg)
+ flags |= BPF_TRAMP_F_IP_ARG;
+
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);
@@ -542,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
u64_stats_update_end(&stats->syncp);
}
-/* The logic is similar to BPF_PROG_RUN, but with an explicit
+/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
* call __bpf_prog_enter
@@ -552,7 +558,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* __bpf_prog_enter returns:
* 0 - skip execution of the bpf prog
* 1 - execute bpf prog
- * [2..MAX_U64] - excute bpf prog and record execution time.
+ * [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c6a27574242d..e76b55917905 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* - unreachable insns exist (shouldn't be a forest. program = one function)
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
* analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
@@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
* If it's ok, then verifier allows this BPF_CALL insn and looks at
* .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
* R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
*
* When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
* insn, the register holding that pointer in the true branch changes state to
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -734,84 +735,111 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->refs[i].id)
verbose(env, ",%d", state->refs[i].id);
}
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
verbose(env, "\n");
}
-#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int copy_##NAME##_state(struct bpf_func_state *dst, \
- const struct bpf_func_state *src) \
-{ \
- if (!src->FIELD) \
- return 0; \
- if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \
- /* internal bug, make state invalid to reject the program */ \
- memset(dst, 0, sizeof(*dst)); \
- return -EFAULT; \
- } \
- memcpy(dst->FIELD, src->FIELD, \
- sizeof(*src->FIELD) * (src->COUNT / SIZE)); \
- return 0; \
-}
-/* copy_reference_state() */
-COPY_STATE_FN(reference, acquired_refs, refs, 1)
-/* copy_stack_state() */
-COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef COPY_STATE_FN
+/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
+ * small to hold src. This is different from krealloc since we don't want to preserve
+ * the contents of dst.
+ *
+ * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could
+ * not be allocated.
+ */
+static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (ZERO_OR_NULL_PTR(src))
+ goto out;
-#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
-static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
- bool copy_old) \
-{ \
- u32 old_size = state->COUNT; \
- struct bpf_##NAME##_state *new_##FIELD; \
- int slot = size / SIZE; \
- \
- if (size <= old_size || !size) { \
- if (copy_old) \
- return 0; \
- state->COUNT = slot * SIZE; \
- if (!size && old_size) { \
- kfree(state->FIELD); \
- state->FIELD = NULL; \
- } \
- return 0; \
- } \
- new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
- GFP_KERNEL); \
- if (!new_##FIELD) \
- return -ENOMEM; \
- if (copy_old) { \
- if (state->FIELD) \
- memcpy(new_##FIELD, state->FIELD, \
- sizeof(*new_##FIELD) * (old_size / SIZE)); \
- memset(new_##FIELD + old_size / SIZE, 0, \
- sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
- } \
- state->COUNT = slot * SIZE; \
- kfree(state->FIELD); \
- state->FIELD = new_##FIELD; \
- return 0; \
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ if (ksize(dst) < bytes) {
+ kfree(dst);
+ dst = kmalloc_track_caller(bytes, flags);
+ if (!dst)
+ return NULL;
+ }
+
+ memcpy(dst, src, bytes);
+out:
+ return dst ? dst : ZERO_SIZE_PTR;
}
-/* realloc_reference_state() */
-REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
-/* realloc_stack_state() */
-REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
-#undef REALLOC_STATE_FN
-/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
- * make it consume minimal amount of memory. check_stack_write() access from
- * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which realloc_stack_state() copies over. It points to previous
- * bpf_verifier_state which is never reallocated.
+/* resize an array from old_n items to new_n items. the array is reallocated if it's too
+ * small to hold new_n items. new items are zeroed out if the array grows.
+ *
+ * Contrary to krealloc_array, does not free arr if new_n is zero.
*/
-static int realloc_func_state(struct bpf_func_state *state, int stack_size,
- int refs_size, bool copy_old)
+static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
- int err = realloc_reference_state(state, refs_size, copy_old);
- if (err)
- return err;
- return realloc_stack_state(state, stack_size, copy_old);
+ if (!new_n || old_n == new_n)
+ goto out;
+
+ arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
+ if (!arr)
+ return NULL;
+
+ if (new_n > old_n)
+ memset(arr + old_n * size, 0, (new_n - old_n) * size);
+
+out:
+ return arr ? arr : ZERO_SIZE_PTR;
+}
+
+static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
+ sizeof(struct bpf_reference_state), GFP_KERNEL);
+ if (!dst->refs)
+ return -ENOMEM;
+
+ dst->acquired_refs = src->acquired_refs;
+ return 0;
+}
+
+static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ size_t n = src->allocated_stack / BPF_REG_SIZE;
+
+ dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!dst->stack)
+ return -ENOMEM;
+
+ dst->allocated_stack = src->allocated_stack;
+ return 0;
+}
+
+static int resize_reference_state(struct bpf_func_state *state, size_t n)
+{
+ state->refs = realloc_array(state->refs, state->acquired_refs, n,
+ sizeof(struct bpf_reference_state));
+ if (!state->refs)
+ return -ENOMEM;
+
+ state->acquired_refs = n;
+ return 0;
+}
+
+static int grow_stack_state(struct bpf_func_state *state, int size)
+{
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+
+ if (old_n >= n)
+ return 0;
+
+ state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state));
+ if (!state->stack)
+ return -ENOMEM;
+
+ state->allocated_stack = size;
+ return 0;
}
/* Acquire a pointer id from the env and update the state->refs to include
@@ -825,7 +853,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
int new_ofs = state->acquired_refs;
int id, err;
- err = realloc_reference_state(state, state->acquired_refs + 1, true);
+ err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
return err;
id = ++env->id_gen;
@@ -854,18 +882,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
return -EINVAL;
}
-static int transfer_reference_state(struct bpf_func_state *dst,
- struct bpf_func_state *src)
-{
- int err = realloc_reference_state(dst, src->acquired_refs, false);
- if (err)
- return err;
- err = copy_reference_state(dst, src);
- if (err)
- return err;
- return 0;
-}
-
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -904,10 +920,6 @@ static int copy_func_state(struct bpf_func_state *dst,
{
int err;
- err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
- false);
- if (err)
- return err;
memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
err = copy_reference_state(dst, src);
if (err)
@@ -919,16 +931,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
- u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;
- if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
- kfree(dst_state->jmp_history);
- dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- }
- memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+ dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+ src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
+ GFP_USER);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
/* if dst has more stack frames then src frame, free them */
@@ -1131,6 +1140,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1518,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env,
init_reg_state(env, state);
}
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog)
+{
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.len_used;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ goto err;
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ if (!frame)
+ goto err;
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
+err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ /* pop all elements and return */
+ while (!pop_stack(env, NULL, NULL, false));
+ return NULL;
+}
+
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -2590,8 +2651,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
struct bpf_reg_state *reg = NULL;
- err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
- state->acquired_refs, true);
+ err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
if (err)
return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -2607,13 +2667,26 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
+
+ for (i = 0; i < size; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conervative.
+ * scalar via different register has to be conservative.
* Backtrack from here and mark all registers as precise
* that contributed into 'reg' being a constant.
*/
@@ -2629,47 +2702,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -2753,8 +2789,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
if (value_reg && register_is_null(value_reg))
writing_zero = true;
- err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
- state->acquired_refs, true);
+ err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
if (err)
return err;
@@ -3239,6 +3274,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
}
+ if (map_value_has_timer(map)) {
+ u32 t = map->timer_off;
+
+ if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+ t < reg->umax_value + off + size) {
+ verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -3641,6 +3685,8 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
+ int next_insn;
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -3648,13 +3694,22 @@ continue_func:
ret_prog[frame] = idx;
/* find the callee */
- i = i + insn[i].imm + 1;
- idx = find_subprog(env, i);
+ next_insn = i + insn[i].imm + 1;
+ idx = find_subprog(env, next_insn);
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i);
+ next_insn);
return -EFAULT;
}
+ if (subprog[idx].is_async_cb) {
+ if (subprog[idx].has_tail_call) {
+ verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size */
+ continue;
+ }
+ i = next_insn;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -3675,6 +3730,8 @@ continue_func:
if (tail_call_reachable)
for (j = 0; j < frame; j++)
subprog[ret_prog[j]].tail_call_reachable = true;
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
@@ -4654,6 +4711,54 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_timer(map)) {
+ if (map->timer_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_timer'\n",
+ map->name);
+ else if (map->timer_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_timer'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_timer is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->timer_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+ val + reg->off, map->timer_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -4786,6 +4891,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -4817,6 +4923,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4946,7 +5053,29 @@ skip_type_check:
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
meta->map_ptr = reg->map_ptr;
+ meta->map_uid = reg->map_uid;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
@@ -4998,6 +5127,9 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_TIMER) {
+ if (process_timer_func(env, regno, meta))
+ return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5170,8 +5302,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_submit &&
- func_id != BPF_FUNC_ringbuf_discard &&
func_id != BPF_FUNC_ringbuf_query)
goto error;
break;
@@ -5280,6 +5410,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -5613,6 +5749,31 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+ struct bpf_verifier_state *async_cb;
+
+ /* there is no real recursion here. timer callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ *insn_idx, subprog);
+ if (!async_cb)
+ return -EFAULT;
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* continue with next insn after call */
+ return 0;
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -5629,7 +5790,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
subprog /* subprog number within this prog */);
/* Transfer references to the callee */
- err = transfer_reference_state(callee, caller);
+ err = copy_reference_state(callee, caller);
if (err)
return err;
@@ -5740,6 +5901,35 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5780,7 +5970,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* Transfer references to the caller */
- err = transfer_reference_state(caller, callee);
+ err = copy_reference_state(caller, callee);
if (err)
return err;
@@ -5953,6 +6143,29 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
return err;
}
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ int func_id = BPF_FUNC_get_func_ip;
+
+ if (type == BPF_PROG_TYPE_TRACING) {
+ if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
+ eatype != BPF_MODIFY_RETURN) {
+ verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ func_id_name(func_id), func_id);
+ return -ENOTSUPP;
+ }
+ return 0;
+ } else if (type == BPF_PROG_TYPE_KPROBE) {
+ return 0;
+ }
+
+ verbose(env, "func %s#%d not supported for program type %d\n",
+ func_id_name(func_id), func_id, type);
+ return -ENOTSUPP;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -6067,6 +6280,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (func_id == BPF_FUNC_timer_set_callback) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_timer_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
if (func_id == BPF_FUNC_snprintf) {
err = check_bpf_snprintf_call(env, regs);
if (err < 0)
@@ -6102,6 +6322,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
if (map_value_has_spin_lock(meta.map_ptr))
@@ -6223,6 +6444,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
env->prog->call_get_stack = true;
+ if (func_id == BPF_FUNC_get_func_ip) {
+ if (check_get_func_ip(env))
+ return -ENOTSUPP;
+ env->prog->call_get_func_ip = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -6557,6 +6784,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -8968,12 +9201,14 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
- if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
if (map_value_has_spin_lock(map))
dst_reg->id = ++env->id_gen;
- } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
} else {
verbose(env, "bpf verifier is misconfigured\n");
@@ -9095,7 +9330,8 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
- const bool is_subprog = env->cur_state->frame[0]->subprogno;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog &&
@@ -9104,7 +9340,7 @@ static int check_return_code(struct bpf_verifier_env *env)
!prog->aux->attach_func_proto->type)
return 0;
- /* eBPF calling convetion is such that R0 is used
+ /* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
@@ -9120,6 +9356,22 @@ static int check_return_code(struct bpf_verifier_env *env)
}
reg = cur_regs(env) + BPF_REG_0;
+
+ if (frame->in_async_callback_fn) {
+ /* enforce return zero from async callbacks like timer */
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(tnum_const(0), reg->var_off)) {
+ verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
@@ -9334,8 +9586,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
init_explored_state(env, t + 1);
if (visit_callee) {
init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+ * check.
+ */
+ bpf_pseudo_func(insns + t));
}
return ret;
}
@@ -9363,6 +9619,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
+ if (insns[t].imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn to trigger is_state_visited() check
+ * before call itself is processed by __check_func_call().
+ * Otherwise new async state will be pushed for further
+ * exploration.
+ */
+ init_explored_state(env, t);
return visit_func_call_insn(t, insn_cnt, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
@@ -9489,7 +9752,7 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
const struct btf_type *type, *func_proto, *ret_type;
u32 i, nfuncs, urec_size, min_size;
@@ -9498,7 +9761,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
struct bpf_func_info_aux *info_aux = NULL;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *urecord;
+ bpfptr_t urecord;
u32 prev_offset = 0;
bool scalar_return;
int ret = -ENOMEM;
@@ -9526,7 +9789,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
prog = env->prog;
btf = prog->aux->btf;
- urecord = u64_to_user_ptr(attr->func_info);
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
min_size = min_t(u32, krec_size, urec_size);
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
@@ -9544,13 +9807,15 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* set the size kernel expects so loader can zero
* out the rest of the record.
*/
- if (put_user(min_size, &uattr->func_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, func_info_rec_size),
+ &min_size, sizeof(min_size)))
ret = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&krecord[i], urecord, min_size)) {
+ if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
ret = -EFAULT;
goto err_free;
}
@@ -9602,7 +9867,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
}
prev_offset = krecord[i].insn_off;
- urecord += urec_size;
+ bpfptr_add(&urecord, urec_size);
}
prog->aux->func_info = krecord;
@@ -9634,19 +9899,21 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
static int check_btf_line(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
struct bpf_subprog_info *sub;
struct bpf_line_info *linfo;
struct bpf_prog *prog;
const struct btf *btf;
- void __user *ulinfo;
+ bpfptr_t ulinfo;
int err;
nr_linfo = attr->line_info_cnt;
if (!nr_linfo)
return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
rec_size = attr->line_info_rec_size;
if (rec_size < MIN_BPF_LINEINFO_SIZE ||
@@ -9667,7 +9934,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
s = 0;
sub = env->subprog_info;
- ulinfo = u64_to_user_ptr(attr->line_info);
+ ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
expected_size = sizeof(struct bpf_line_info);
ncopy = min_t(u32, expected_size, rec_size);
for (i = 0; i < nr_linfo; i++) {
@@ -9675,14 +9942,15 @@ static int check_btf_line(struct bpf_verifier_env *env,
if (err) {
if (err == -E2BIG) {
verbose(env, "nonzero tailing record in line_info");
- if (put_user(expected_size,
- &uattr->line_info_rec_size))
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, line_info_rec_size),
+ &expected_size, sizeof(expected_size)))
err = -EFAULT;
}
goto err_free;
}
- if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
err = -EFAULT;
goto err_free;
}
@@ -9734,7 +10002,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
}
prev_offset = linfo[i].insn_off;
- ulinfo += rec_size;
+ bpfptr_add(&ulinfo, rec_size);
}
if (s != env->subprog_cnt) {
@@ -9756,7 +10024,7 @@ err_free:
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
@@ -9801,13 +10069,6 @@ static bool range_within(struct bpf_reg_state *old,
old->s32_max_value >= cur->s32_max_value;
}
-/* Maximum number of register states that can exist at once */
-#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
-struct idpair {
- u32 old;
- u32 cur;
-};
-
/* If in the old state two registers had the same id, then they need to have
* the same id in the new state as well. But that id could be different from
* the old state, so we need to track the mapping from old to new ids.
@@ -9818,11 +10079,11 @@ struct idpair {
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
{
unsigned int i;
- for (i = 0; i < ID_MAP_SIZE; i++) {
+ for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
if (!idmap[i].old) {
/* Reached an empty slot; haven't seen this id before */
idmap[i].old = old_id;
@@ -9899,7 +10160,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
* we can call this clean_live_states() function to mark all liveness states
* as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -9934,8 +10195,8 @@ next:
}
/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct idpair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
bool equal;
@@ -9961,6 +10222,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
switch (rold->type) {
case SCALAR_VALUE:
+ if (env->explore_alu_limits)
+ return false;
if (rcur->type == SCALAR_VALUE) {
if (!rold->precise && !rcur->precise)
return true;
@@ -10051,9 +10314,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct idpair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10098,9 +10360,8 @@ static bool stacksafe(struct bpf_func_state *old,
continue;
if (old->stack[spi].slot_type[0] != STACK_SPILL)
continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10150,32 +10411,24 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool func_states_equal(struct bpf_func_state *old,
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
struct bpf_func_state *cur)
{
- struct idpair *idmap;
- bool ret = false;
int i;
- idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
- /* If we failed to allocate the idmap, just say it's not safe */
- if (!idmap)
- return false;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
- goto out_free;
- }
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ env->idmap_scratch))
+ return false;
- if (!stacksafe(old, cur, idmap))
- goto out_free;
+ if (!stacksafe(env, old, cur, env->idmap_scratch))
+ return false;
if (!refsafe(old, cur))
- goto out_free;
- ret = true;
-out_free:
- kfree(idmap);
- return ret;
+ return false;
+
+ return true;
}
static bool states_equal(struct bpf_verifier_env *env,
@@ -10202,7 +10455,7 @@ static bool states_equal(struct bpf_verifier_env *env,
for (i = 0; i <= old->curframe; i++) {
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i]))
return false;
}
return true;
@@ -10383,9 +10636,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
+
if (sl->state.branches) {
- if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ } else if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -11239,6 +11508,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
struct bpf_map *map;
struct fd f;
u64 addr;
+ u32 fd;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -11268,16 +11538,38 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
- if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
- insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
- (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
- insn[1].imm != 0)) {
- verbose(env,
- "unrecognized bpf_ld_imm64 insn\n");
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_VALUE:
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ break;
+ case BPF_PSEUDO_MAP_FD:
+ case BPF_PSEUDO_MAP_IDX:
+ if (insn[1].imm == 0)
+ break;
+ fallthrough;
+ default:
+ verbose(env, "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
- f = fdget(insn[0].imm);
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ case BPF_PSEUDO_MAP_IDX:
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "fd_idx without fd_array is invalid\n");
+ return -EPROTO;
+ }
+ if (copy_from_bpfptr_offset(&fd, env->fd_array,
+ insn[0].imm * sizeof(fd),
+ sizeof(fd)))
+ return -EFAULT;
+ break;
+ default:
+ fd = insn[0].imm;
+ break;
+ }
+
+ f = fdget(fd);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
@@ -11292,7 +11584,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
}
aux = &env->insn_aux_data[i];
- if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
+ insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
} else {
u32 off = insn[1].imm;
@@ -11410,10 +11703,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_insn_aux_data *new_data,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
u32 old_seen = old_data[off].seen;
u32 prog_len;
@@ -11426,12 +11720,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
- return 0;
+ return;
prog_len = new_prog->len;
- new_data = vzalloc(array_size(prog_len,
- sizeof(struct bpf_insn_aux_data)));
- if (!new_data)
- return -ENOMEM;
+
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
@@ -11442,7 +11733,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
}
env->insn_aux_data = new_data;
vfree(old_data);
- return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -11459,7 +11749,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
-static void adjust_poke_descs(struct bpf_prog *prog, u32 len)
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
{
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
int i, sz = prog->aux->size_poke_tab;
@@ -11467,6 +11757,8 @@ static void adjust_poke_descs(struct bpf_prog *prog, u32 len)
for (i = 0; i < sz; i++) {
desc = &tab[i];
+ if (desc->insn_idx <= off)
+ continue;
desc->insn_idx += len - 1;
}
}
@@ -11475,6 +11767,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)));
+ if (!new_data)
+ return NULL;
+ }
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@@ -11482,12 +11782,12 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
+ vfree(new_data);
return NULL;
}
- if (adjust_insn_aux_data(env, new_prog, off, len))
- return NULL;
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
- adjust_poke_descs(new_prog, len);
+ adjust_poke_descs(new_prog, off, len);
return new_prog;
}
@@ -11661,6 +11961,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
@@ -11887,35 +12188,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ ctx_access = true;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -11929,6 +12228,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
+ if (!ctx_access)
+ continue;
+
switch (env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -12000,6 +12302,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
+ return -EINVAL;
+ }
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -12088,7 +12394,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- /* BPF_PROG_RUN doesn't call subprogs directly,
+ /* bpf_prog_run() doesn't call subprogs directly,
* hence main prog stats include the runtime of subprogs.
* subprogs don't have IDs and not reachable via prog_get_next_id
* func[i]->stats will never be accessed and stays NULL
@@ -12104,33 +12410,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
goto out_free;
func[i]->is_func = 1;
func[i]->aux->func_idx = i;
- /* the btf and func_info will be freed only at prog->aux */
+ /* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->poke_tab = prog->aux->poke_tab;
+ func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
for (j = 0; j < prog->aux->size_poke_tab; j++) {
- u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
- int ret;
-
- if (!(insn_idx >= subprog_start &&
- insn_idx <= subprog_end))
- continue;
-
- ret = bpf_jit_add_poke_descriptor(func[i],
- &prog->aux->poke_tab[j]);
- if (ret < 0) {
- verbose(env, "adding tail call poke descriptor failed\n");
- goto out_free;
- }
+ struct bpf_jit_poke_descriptor *poke;
- func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
-
- map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
- ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
- if (ret < 0) {
- verbose(env, "tracking tail call prog failed\n");
- goto out_free;
- }
+ poke = &prog->aux->poke_tab[j];
+ if (poke->insn_idx < subprog_end &&
+ poke->insn_idx >= subprog_start)
+ poke->aux = func[i]->aux;
}
/* Use bpf_prog_F_tag to indicate functions in stack traces.
@@ -12161,18 +12453,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
cond_resched();
}
- /* Untrack main program's aux structs so that during map_poke_run()
- * we will not stumble upon the unfilled poke descriptors; each
- * of the main program's poke descs got distributed across subprogs
- * and got tracked onto map, so we are sure that none of them will
- * be missed after the operation below
- */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
-
- map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
- }
-
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
@@ -12250,14 +12530,22 @@ static int jit_subprogs(struct bpf_verifier_env *env)
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
+ /* We failed JIT'ing, so at this point we need to unregister poke
+ * descriptors from subprogs, so that kernel is not attempting to
+ * patch it anymore as we're freeing the subprog JIT memory.
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+ /* At this point we're guaranteed that poke descriptors are not
+ * live anymore. We can just unlink its descriptor table as it's
+ * released with the main prog.
+ */
for (i = 0; i < env->subprog_cnt; i++) {
if (!func[i])
continue;
-
- for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
- map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
- map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
- }
+ func[i]->aux->poke_tab = NULL;
bpf_jit_free(func[i]);
}
kfree(func);
@@ -12352,6 +12640,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@@ -12506,7 +12795,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpeter for every normal
+ * conditional branch in the interpreter for every normal
* call and to prevent accidental JITing by JIT compiler
* that doesn't support bpf_tail_call yet
*/
@@ -12569,6 +12858,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -12685,6 +13007,21 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -12751,37 +13088,6 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- *
- * Note that special handling is needed on !env->bypass_spec_v1 if this is
- * ever called outside of error path with subsequent program rejection.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -12858,9 +13164,6 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
@@ -13281,6 +13584,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
int ret;
u64 key;
+ if (prog->type == BPF_PROG_TYPE_SYSCALL) {
+ if (prog->aux->sleepable)
+ /* attach_btf_id checked to be zero already */
+ return 0;
+ verbose(env, "Syscall programs can only be sleepable\n");
+ return -EINVAL;
+ }
+
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
prog->type != BPF_PROG_TYPE_LSM) {
verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
@@ -13355,8 +13666,7 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
- union bpf_attr __user *uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
@@ -13386,6 +13696,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
is_priv = bpf_capable();
bpf_get_btf_vmlinux();
diff --git a/kernel/cfi.c b/kernel/cfi.c
index e17a56639766..9594cfd1cf2c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -248,9 +248,9 @@ static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
{
cfi_check_fn fn;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
@@ -269,11 +269,11 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
cfi_check_fn fn = NULL;
struct module *mod;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
mod = __module_address(ptr);
if (mod)
fn = mod->cfi_check;
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1f274d7fc934..35b920328344 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -50,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid)
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
@@ -80,7 +82,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
@@ -89,6 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
@@ -682,6 +686,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
@@ -713,7 +719,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
+ switch (READ_ONCE(tsk->__state)) {
case TASK_RUNNING:
stats->nr_running++;
break;
@@ -911,13 +917,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
- if (strcmp(param->key, "source") == 0) {
- if (fc->source)
- return invalf(fc, "Multiple sources not supported");
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ int ret;
+
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
@@ -1223,9 +1227,7 @@ int cgroup1_get_tree(struct fs_context *fc)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 21ecc6ee6a6d..ea08f01d0111 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -68,6 +68,14 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -209,6 +217,22 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
@@ -232,7 +256,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
bool cgroup_ssid_enabled(int ssid)
{
- if (CGROUP_SUBSYS_COUNT == 0)
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -456,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
- if (ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
@@ -534,6 +558,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
do {
css = cgroup_css(cgrp, ss);
@@ -561,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
rcu_read_lock();
do {
@@ -577,6 +607,7 @@ out_unlock:
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
@@ -630,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
- if (cft->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
@@ -678,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css);
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
- if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
@@ -2152,13 +2183,14 @@ static void cgroup_kill_sb(struct super_block *sb)
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2356,7 +2388,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
- while (&cset->mg_node != tset->csets) {
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
@@ -2389,7 +2421,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3631,6 +3663,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
psi_trigger_replace(&of->priv, NULL);
}
+
+bool cgroup_psi_enabled(void)
+{
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3667,6 +3711,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ set_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+
+ spin_lock_irq(&css_set_lock);
+ clear_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
@@ -3881,6 +3999,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3958,6 +4078,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
WARN_ON(cft->ss || cft->kf_ops);
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
+
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@@ -4536,7 +4659,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->ss = css->ss;
it->flags = flags;
- if (it->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
@@ -4860,12 +4983,18 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_freeze_write,
},
{
+ .name = "cgroup.kill",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4873,6 +5002,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4880,6 +5010,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5771,6 +5902,31 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp, on failure return NULL
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp = NULL;
+
+ mutex_lock(&cgroup_mutex);
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ goto out_unlock;
+
+ cgrp = kn->priv;
+ if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+ cgrp = NULL;
+ kernfs_put(kn);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
+/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
@@ -6079,6 +6235,8 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
@@ -6090,6 +6248,11 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
+ if (kargs->cgrp)
+ cgrp_flags = kargs->cgrp->flags;
+ else
+ cgrp_flags = cset->dfl_cgrp->flags;
+
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
@@ -6098,23 +6261,32 @@ void cgroup_post_fork(struct task_struct *child,
cset = NULL;
}
- /*
- * If the cgroup has to be frozen, the new task has too. Let's set
- * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
- * frozen state.
- */
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
/*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later from
- * do_freezer_trap(). So we avoid cgroup's transient switch
- * from the frozen state and back.
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
*/
+ kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@@ -6137,6 +6309,10 @@ void cgroup_post_fork(struct task_struct *child,
put_css_set(rcset);
}
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
cgroup_css_set_put_fork(kargs);
}
@@ -6162,7 +6338,8 @@ void cgroup_exit(struct task_struct *tsk)
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@@ -6213,6 +6390,15 @@ static int __init cgroup_disable(char *str)
pr_info("Disabling %s control group subsystem\n",
ss->name);
}
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
+ }
}
return 1;
}
@@ -6388,74 +6574,51 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
@@ -6511,6 +6674,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
+
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..2a9695ccb65f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -160,6 +160,9 @@ struct cpuset {
*/
int use_parent_ecpus;
int child_ecpus_count;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
};
/*
@@ -263,6 +266,16 @@ static inline int is_partition_root(const struct cpuset *cs)
return cs->partition_root_state > 0;
}
+/*
+ * Send notification event of whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs,
+ int old_prs, int new_prs)
+{
+ if (old_prs != new_prs)
+ cgroup_file_notify(&cs->partition_file);
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
@@ -298,17 +311,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -372,18 +387,29 @@ static inline bool is_in_v2_mode(void)
}
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
@@ -393,11 +419,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -409,7 +437,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -421,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -442,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -551,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -674,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -700,7 +728,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -979,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1040,11 +1068,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
/**
@@ -1052,7 +1080,7 @@ void rebuild_sched_domains(void)
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
@@ -1114,7 +1142,7 @@ enum subparts_cmd {
* cpus_allowed can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
*
@@ -1148,6 +1176,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpuset *parent = parent_cs(cpuset);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ int old_prs, new_prs;
bool part_error = false; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1183,6 +1212,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* A cpumask update cannot make parent's effective_cpus become empty.
*/
adding = deleting = false;
+ old_prs = new_prs = cpuset->partition_root_state;
if (cmd == partcmd_enable) {
cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
adding = true;
@@ -1225,7 +1255,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effectiveb_cpus
+ * addmask = cpus_allowed & parent->effective_cpus
*
* Note that parent's subparts_cpus may have been
* pre-shrunk in case there is a change in the cpu list.
@@ -1247,11 +1277,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
switch (cpuset->partition_root_state) {
case PRS_ENABLED:
if (part_error)
- cpuset->partition_root_state = PRS_ERROR;
+ new_prs = PRS_ERROR;
break;
case PRS_ERROR:
if (!part_error)
- cpuset->partition_root_state = PRS_ENABLED;
+ new_prs = PRS_ENABLED;
break;
}
/*
@@ -1260,10 +1290,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
part_error = (prev_prs == PRS_ERROR);
}
- if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ if (!part_error && (new_prs == PRS_ERROR))
return 0; /* Nothing need to be done */
- if (cpuset->partition_root_state == PRS_ERROR) {
+ if (new_prs == PRS_ERROR) {
/*
* Remove all its cpus from parent's subparts_cpus.
*/
@@ -1272,7 +1302,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
parent->subparts_cpus);
}
- if (!adding && !deleting)
+ if (!adding && !deleting && (new_prs == old_prs))
return 0;
/*
@@ -1299,7 +1329,12 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+
+ if (old_prs != new_prs)
+ cpuset->partition_root_state = new_prs;
+
spin_unlock_irq(&callback_lock);
+ notify_partition_change(cpuset, old_prs, new_prs);
return cmd == partcmd_update;
}
@@ -1314,13 +1349,14 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
+ int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -1360,17 +1396,18 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
* update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's subparts_cpus changes.
*/
- if ((cp != cs) && cp->partition_root_state) {
+ old_prs = new_prs = cp->partition_root_state;
+ if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
case PRS_DISABLED:
/*
* If parent is not a partition root or an
- * invalid partition root, clear the state
- * state and the CS_CPU_EXCLUSIVE flag.
+ * invalid partition root, clear its state
+ * and its CS_CPU_EXCLUSIVE flag.
*/
WARN_ON_ONCE(cp->partition_root_state
!= PRS_ERROR);
- cp->partition_root_state = 0;
+ new_prs = PRS_DISABLED;
/*
* clear_bit() is an atomic operation and
@@ -1391,11 +1428,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* When parent is invalid, it has to be too.
*/
- cp->partition_root_state = PRS_ERROR;
- if (cp->nr_subparts_cpus) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
+ new_prs = PRS_ERROR;
break;
}
}
@@ -1407,8 +1440,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus &&
- (cp->partition_root_state != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
} else if (cp->nr_subparts_cpus) {
@@ -1435,7 +1467,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
= cpumask_weight(cp->subparts_cpus);
}
}
+
+ if (new_prs != old_prs)
+ cp->partition_root_state = new_prs;
+
spin_unlock_irq(&callback_lock);
+ notify_partition_change(cp, old_prs, new_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1612,6 +1649,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct cpuset_migrate_mm_work *mwork;
+ if (nodes_equal(*from, *to)) {
+ mmput(mm);
+ return;
+ }
+
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
@@ -1664,12 +1706,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1682,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1728,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1781,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1871,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1891,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1937,34 +1979,32 @@ out:
/*
* update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+ * cs: the cpuset to update
+ * new_prs: new partition root state
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err;
+ int err, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
- struct tmpmasks tmp;
+ struct tmpmasks tmpmask;
- if ((val != 0) && (val != 1))
- return -EINVAL;
- if (val == cs->partition_root_state)
+ if (old_prs == new_prs)
return 0;
/*
* Cannot force a partial or invalid partition root to a full
* partition root.
*/
- if (val && cs->partition_root_state)
+ if (new_prs && (old_prs == PRS_ERROR))
return -EINVAL;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
err = -EINVAL;
- if (!cs->partition_root_state) {
+ if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
@@ -1978,31 +2018,27 @@ static int update_prstate(struct cpuset *cs, int val)
goto out;
err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
- cs->partition_root_state = PRS_ENABLED;
} else {
/*
* Turning off partition root will clear the
* CS_CPU_EXCLUSIVE bit.
*/
- if (cs->partition_root_state == PRS_ERROR) {
- cs->partition_root_state = 0;
+ if (old_prs == PRS_ERROR) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
err = 0;
goto out;
}
err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err)
goto out;
- cs->partition_root_state = 0;
-
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
@@ -2015,11 +2051,18 @@ static int update_prstate(struct cpuset *cs, int val)
update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ update_sibling_cpumasks(parent, cs, &tmpmask);
rebuild_sched_domains_locked();
out:
- free_cpumasks(NULL, &tmp);
+ if (!err) {
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(cs, old_prs, new_prs);
+ }
+
+ free_cpumasks(NULL, &tmpmask);
return err;
}
@@ -2126,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2178,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2186,7 +2229,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
+ /* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
@@ -2199,15 +2242,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -2282,7 +2323,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -2320,7 +2361,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2331,7 +2372,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2346,7 +2387,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2378,14 +2419,14 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2411,7 +2452,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2542,7 +2583,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
css_get(&cs->css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2550,7 +2591,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
retval = update_prstate(cs, val);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2702,6 +2743,7 @@ static struct cftype dfl_files[] = {
.write = sched_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
},
{
@@ -2737,12 +2779,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
}
- set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
return &cs->css;
}
@@ -2756,7 +2802,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
@@ -2809,7 +2855,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -2828,7 +2874,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (is_partition_root(cs))
@@ -2849,7 +2895,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3060,7 +3106,7 @@ retry:
goto retry;
}
- parent = parent_cs(cs);
+ parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
@@ -3082,8 +3128,10 @@ retry:
if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
(parent->partition_root_state == PRS_ERROR))) {
if (cs->nr_subparts_cpus) {
+ spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
compute_effective_cpumask(&new_cpus, cs, parent);
}
@@ -3095,9 +3143,17 @@ retry:
*/
if ((parent->partition_root_state == PRS_ERROR) ||
cpumask_empty(&new_cpus)) {
+ int old_prs;
+
update_parent_subparts_cpumask(cs, partcmd_disable,
NULL, tmp);
- cs->partition_root_state = PRS_ERROR;
+ old_prs = cs->partition_root_state;
+ if (old_prs != PRS_ERROR) {
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = PRS_ERROR;
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(cs, old_prs, PRS_ERROR);
+ }
}
cpuset_force_rebuild();
}
@@ -3168,6 +3224,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+ /*
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
+ * we assumed that cpus are updated.
+ */
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+ cpus_updated = true;
+
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
@@ -3302,9 +3365,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
- rcu_read_unlock();
+ guarantee_online_cpus(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3318,13 +3379,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3344,6 +3414,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
@@ -3603,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index f5e8828c109c..0d5c29879a50 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cee265cb535c..b264ab5652ba 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
}
/**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
@@ -347,19 +347,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -367,18 +368,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -394,7 +397,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
diff --git a/kernel/compat.c b/kernel/compat.c
index 05adfd6fa8bf..55551989d9da 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return 0;
}
EXPORT_SYMBOL_GPL(get_compat_sigset);
-
-/*
- * Allocate user-space memory for the duration of a single system call,
- * in order to marshall parameters inside a compat thunk.
- */
-void __user *compat_alloc_user_space(unsigned long len)
-{
- void __user *ptr;
-
- /* If len would occupy more than half of the entire compat space... */
- if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
- return NULL;
-
- ptr = arch_compat_alloc_user_space(len);
-
- if (unlikely(!access_ok(ptr, len)))
- return NULL;
-
- return ptr;
-}
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e538518556f4..192e43a87407 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -32,6 +32,7 @@
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -40,14 +41,19 @@
#include "smpboot.h"
/**
- * cpuhp_cpu_state - Per cpu hotplug state storage
+ * struct cpuhp_cpu_state - Per cpu hotplug state storage
* @state: The current cpu state
* @target: The target state
+ * @fail: Current CPU hotplug callback state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
+ * @cpu: CPU number
+ * @node: Remote CPU node; for multi-instance, do a
+ * single entry callback for install/remove
+ * @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done_up: Signal completion to the issuer of the task for cpu-up
@@ -105,11 +111,12 @@ static inline void cpuhp_lock_release(bool bringup) { }
#endif
/**
- * cpuhp_step - Hotplug state machine step
+ * struct cpuhp_step - Hotplug state machine step
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
* @cant_stop: Bringup/teardown can't be stopped at this step
+ * @multi_instance: State has multiple instances which get added afterwards
*/
struct cpuhp_step {
const char *name;
@@ -123,7 +130,9 @@ struct cpuhp_step {
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} teardown;
+ /* private: */
struct hlist_head list;
+ /* public: */
bool cant_stop;
bool multi_instance;
};
@@ -142,7 +151,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
}
/**
- * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
@@ -150,6 +159,8 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
* @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
+ *
+ * Return: %0 on success or a negative errno code
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
@@ -681,6 +692,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
if (ret) {
+ pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
cpuhp_reset_state(st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
@@ -873,6 +888,52 @@ void __init cpuhp_threads_init(void)
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
+/*
+ *
+ * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
+ * protected region.
+ *
+ * The operation is still serialized against concurrent CPU hotplug via
+ * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
+ * serialized against other hotplug related activity like adding or
+ * removing of state callbacks and state instances, which invoke either the
+ * startup or the teardown callback of the affected state.
+ *
+ * This is required for subsystems which are unfixable vs. CPU hotplug and
+ * evade lock inversion problems by scheduling work which has to be
+ * completed _before_ cpu_up()/_cpu_down() returns.
+ *
+ * Don't even think about adding anything to this for any new code or even
+ * drivers. It's only purpose is to keep existing lock order trainwrecks
+ * working.
+ *
+ * For cpu_down() there might be valid reasons to finish cleanups which are
+ * not required to be done under cpu_hotplug_lock, but that's a different
+ * story and would be not invoked via this.
+ */
+static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+{
+ /*
+ * cpusets delegate hotplug operations to a worker to "solve" the
+ * lock order problems. Wait for the worker, but only if tasks are
+ * _not_ frozen (suspend, hibernate) as that would wait forever.
+ *
+ * The wait is required because otherwise the hotplug operation
+ * returns with inconsistent state, which could even be observed in
+ * user space when a new CPU is brought up. The CPU plug uevent
+ * would be delivered and user space reacting on it would fail to
+ * move tasks to the newly plugged CPU up to the point where the
+ * work has finished because up to that point the newly plugged CPU
+ * is not assignable in cpusets/cgroups. On unplug that's not
+ * necessarily a visible issue, but it is still inconsistent state,
+ * which is the real problem which needs to be "fixed". This can't
+ * prevent the transient state between scheduling the work and
+ * returning from waiting for it.
+ */
+ if (!tasks_frozen)
+ cpuset_wait_for_hotplug();
+}
+
#ifdef CONFIG_HOTPLUG_CPU
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -961,7 +1022,7 @@ static int takedown_cpu(unsigned int cpu)
int err;
/* Park the smpboot threads */
- kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
+ kthread_park(st->thread);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
@@ -977,7 +1038,7 @@ static int takedown_cpu(unsigned int cpu)
/* CPU refused to die */
irq_unlock_sparse();
/* Unpark the hotplug thread so we can rollback there */
- kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
+ kthread_unpark(st->thread);
return err;
}
BUG_ON(cpu_online(cpu));
@@ -1034,6 +1095,9 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
if (ret) {
+ pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
cpuhp_reset_state(st, prev_state);
@@ -1108,6 +1172,7 @@ out:
*/
lockup_detector_cleanup();
arch_smt_update();
+ cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
@@ -1135,6 +1200,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target)
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use remove_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_down(struct device *dev)
{
@@ -1302,6 +1369,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
out:
cpus_write_unlock();
arch_smt_update();
+ cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
@@ -1346,6 +1414,8 @@ out:
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use add_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_up(struct device *dev)
{
@@ -1371,6 +1441,8 @@ EXPORT_SYMBOL_GPL(add_cpu);
* On some architectures like arm64, we can hibernate on any CPU, but on
* wake up the CPU we hibernated on might be offline as a side effect of
* using maxcpus= for example.
+ *
+ * Return: %0 on success or a negative errno code
*/
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
@@ -1927,6 +1999,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
+ * @name: Name of the step
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
* @startup: startup callback function
@@ -1935,9 +2008,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* added afterwards.
*
* The caller needs to hold cpus read locked while calling this function.
- * Returns:
+ * Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2183,18 +2256,17 @@ int cpuhp_smt_enable(void)
#endif
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
-static ssize_t show_cpuhp_state(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->state);
}
-static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+static DEVICE_ATTR_RO(state);
-static ssize_t write_cpuhp_target(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t target_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2232,19 +2304,17 @@ out:
return ret ? ret : count;
}
-static ssize_t show_cpuhp_target(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t target_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->target);
}
-static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
-
+static DEVICE_ATTR_RW(target);
-static ssize_t write_cpuhp_fail(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2293,15 +2363,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return count;
}
-static ssize_t show_cpuhp_fail(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->fail);
}
-static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+static DEVICE_ATTR_RW(fail);
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
@@ -2316,7 +2386,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = {
NULL
};
-static ssize_t show_cpuhp_states(struct device *dev,
+static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
ssize_t cur, res = 0;
@@ -2335,7 +2405,7 @@ static ssize_t show_cpuhp_states(struct device *dev,
mutex_unlock(&cpuhp_state_mutex);
return res;
}
-static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+static DEVICE_ATTR_RO(states);
static struct attribute *cpuhp_cpu_root_attrs[] = {
&dev_attr_states.attr,
@@ -2408,28 +2478,27 @@ static const char *smt_states[] = {
[CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
};
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t control_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
const char *state = smt_states[cpu_smt_control];
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
-static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t control_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
return __store_smt_control(dev, attr, buf, count);
}
-static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+static DEVICE_ATTR_RW(control);
-static ssize_t
-show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
-static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+static DEVICE_ATTR_RO(active);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..246efc74e3f3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,19 +13,32 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
/*
- * atomic_notifier_call_chain has a RCU read critical section, which
- * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
- * RCU know this.
+ * This introduces a RCU read critical section, which could be
+ * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
+ * this.
*/
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event)
static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
{
+ unsigned long flags;
int ret;
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 684a6061a13a..eb53f5ec62c9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -4,6 +4,7 @@
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*/
+#include <linux/buildid.h>
#include <linux/crash_core.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -378,53 +379,6 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
-#define NOTES_SIZE (&__stop_notes - &__start_notes)
-#define BUILD_ID_MAX SHA1_DIGEST_SIZE
-#define NT_GNU_BUILD_ID 3
-
-struct elf_note_section {
- struct elf_note n_hdr;
- u8 n_data[];
-};
-
-/*
- * Add build ID from .notes section as generated by the GNU ld(1)
- * or LLVM lld(1) --build-id option.
- */
-static void add_build_id_vmcoreinfo(void)
-{
- char build_id[BUILD_ID_MAX * 2 + 1];
- int n_remain = NOTES_SIZE;
-
- while (n_remain >= sizeof(struct elf_note)) {
- const struct elf_note_section *note_sec =
- &__start_notes + NOTES_SIZE - n_remain;
- const u32 n_namesz = note_sec->n_hdr.n_namesz;
-
- if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
- n_namesz != 0 &&
- !strcmp((char *)&note_sec->n_data[0], "GNU")) {
- if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
- const u32 n_descsz = note_sec->n_hdr.n_descsz;
- const u8 *s = &note_sec->n_data[n_namesz];
-
- s = PTR_ALIGN(s, 4);
- bin2hex(build_id, s, n_descsz);
- build_id[2 * n_descsz] = '\0';
- VMCOREINFO_BUILD_ID(build_id);
- return;
- }
- pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
- note_sec->n_hdr.n_descsz,
- BUILD_ID_MAX);
- return;
- }
- n_remain -= sizeof(struct elf_note) +
- ALIGN(note_sec->n_hdr.n_namesz, 4) +
- ALIGN(note_sec->n_hdr.n_descsz, 4);
- }
-}
-
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -443,7 +397,7 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- add_build_id_vmcoreinfo();
+ VMCOREINFO_BUILD_ID();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
@@ -455,7 +409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
VMCOREINFO_SYMBOL(mem_map);
VMCOREINFO_SYMBOL(contig_page_data);
#endif
@@ -484,7 +438,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
VMCOREINFO_OFFSET(pglist_data, node_mem_map);
#endif
VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
diff --git a/kernel/cred.c b/kernel/cred.c
index e1d274cd741b..1ae0b4948a5a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -60,6 +60,7 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
+ .ucounts = &init_ucounts,
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ if (cred->ucounts)
+ put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -222,7 +225,6 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
-
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -282,8 +284,13 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
+
validate_creds(new);
return new;
@@ -351,7 +358,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -363,6 +370,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
+ ret = set_cred_ucounts(new);
+ if (ret < 0)
+ goto error_put;
}
#ifdef CONFIG_KEYS
@@ -384,8 +394,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
@@ -485,12 +495,12 @@ int commit_creds(struct cred *new)
* in set_user().
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
- atomic_dec(&old->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
/* send notifications */
@@ -653,6 +663,32 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
}
EXPORT_SYMBOL(cred_fscmp);
+int set_cred_ucounts(struct cred *new)
+{
+ struct task_struct *task = current;
+ const struct cred *old = task->real_cred;
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
+
+ if (new->user == old->user && new->user_ns == old->user_ns)
+ return 0;
+
+ /*
+ * This optimization is needed because alloc_ucounts() uses locks
+ * for table lookups.
+ */
+ if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ return 0;
+
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ return -EAGAIN;
+
+ new->ucounts = new_ucounts;
+ if (old_ucounts)
+ put_ucounts(old_ucounts);
+
+ return 0;
+}
+
/*
* initialise the credentials stuff
*/
@@ -716,6 +752,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 4708aec492df..da06a5553835 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) "KGDB: " fmt
@@ -1032,12 +1029,13 @@ dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
/*
* Take the following action on reboot notify depending on value:
* 1 == Enter debugger
- * 0 == [the default] detatch debug client
+ * 0 == [the default] detach debug client
* -1 == Do nothing... and use this until the board resets
*/
switch (kgdbreboot) {
case 1:
kgdb_breakpoint();
+ goto done;
case -1:
goto done;
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 8372897402f4..9d34d2364b5a 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#include <linux/kernel.h>
@@ -1045,8 +1042,8 @@ int gdb_serial_stub(struct kgdb_state *ks)
gdb_cmd_detachkill(ks);
return DBG_PASS_EVENT;
}
-#endif
fallthrough;
+#endif
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 2168f8dacb99..372025cf1ca3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -523,51 +523,51 @@ static int kdb_ss(int argc, const char **argv)
}
static kdbtab_t bptab[] = {
- { .cmd_name = "bp",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Set/Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bp",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Set/Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bl",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bl",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bc",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Clear Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bc",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Clear Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "be",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Enable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "be",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Enable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "bd",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Disable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bd",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Disable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "ss",
- .cmd_func = kdb_ss,
- .cmd_usage = "",
- .cmd_help = "Single Step",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "ss",
+ .func = kdb_ss,
+ .usage = "",
+ .help = "Single Step",
+ .minlen = 1,
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
};
static kdbtab_t bphcmd = {
- .cmd_name = "bph",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "[datar [length]|dataw [length]] Set hw brk",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ .name = "bph",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "[datar [length]|dataw [length]] Set hw brk",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
};
/* Initialize the breakpoint table and register breakpoint commands. */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 0220afda3200..e91fc3e4edd5 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -140,7 +140,6 @@ int kdb_stub(struct kgdb_state *ks)
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
- kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1baa96a2ecb8..fa6deda894a1 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -33,7 +33,6 @@
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
-#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
@@ -253,7 +252,7 @@ static char *kdballocenv(size_t bytes)
* Parameters:
* match A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of the env variable 'match'
+ * *value the unsigned long representation of the env variable 'match'
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
@@ -356,7 +355,7 @@ static void kdb_printenv(void)
* Parameters:
* arg A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of arg.
+ * *value the unsigned long representation of arg.
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
@@ -470,7 +469,7 @@ static int kdb_check_regs(void)
* symbol name, and offset to the caller.
*
* The argument may consist of a numeric value (decimal or
- * hexidecimal), a symbol name, a register name (preceded by the
+ * hexadecimal), a symbol name, a register name (preceded by the
* percent sign), an environment variable with a numeric value
* (preceded by a dollar sign) or a simple arithmetic expression
* consisting of a symbol name, +/-, and a numeric constant value
@@ -654,16 +653,17 @@ static void kdb_cmderror(int diag)
* Returns:
* zero for success, a kdb diagnostic if error
*/
-struct defcmd_set {
- int count;
- bool usable;
- char *name;
- char *usage;
- char *help;
- char **command;
+struct kdb_macro {
+ kdbtab_t cmd; /* Macro command */
+ struct list_head statements; /* Associated statement list */
+};
+
+struct kdb_macro_statement {
+ char *statement; /* Statement text */
+ struct list_head list_node; /* Statement list node */
};
-static struct defcmd_set *defcmd_set;
-static int defcmd_set_count;
+
+static struct kdb_macro *kdb_macro;
static bool defcmd_in_progress;
/* Forward references */
@@ -671,53 +671,55 @@ static int kdb_exec_defcmd(int argc, const char **argv);
static int kdb_defcmd2(const char *cmdstr, const char *argv0)
{
- struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
- char **save_command = s->command;
+ struct kdb_macro_statement *kms;
+
+ if (!kdb_macro)
+ return KDB_NOTIMP;
+
if (strcmp(argv0, "endefcmd") == 0) {
defcmd_in_progress = false;
- if (!s->count)
- s->usable = false;
- if (s->usable)
- /* macros are always safe because when executed each
- * internal command re-enters kdb_parse() and is
- * safety checked individually.
- */
- kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
- s->help, 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ if (!list_empty(&kdb_macro->statements))
+ kdb_register(&kdb_macro->cmd);
return 0;
}
- if (!s->usable)
- return KDB_NOTIMP;
- s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB);
- if (!s->command) {
- kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+
+ kms = kmalloc(sizeof(*kms), GFP_KDB);
+ if (!kms) {
+ kdb_printf("Could not allocate new kdb macro command: %s\n",
cmdstr);
- s->usable = false;
return KDB_NOTIMP;
}
- memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
- s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
- kfree(save_command);
+
+ kms->statement = kdb_strdup(cmdstr, GFP_KDB);
+ list_add_tail(&kms->list_node, &kdb_macro->statements);
+
return 0;
}
static int kdb_defcmd(int argc, const char **argv)
{
- struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ kdbtab_t *mp;
+
if (defcmd_in_progress) {
kdb_printf("kdb: nested defcmd detected, assuming missing "
"endefcmd\n");
kdb_defcmd2("endefcmd", "endefcmd");
}
if (argc == 0) {
- int i;
- for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
- kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
- s->usage, s->help);
- for (i = 0; i < s->count; ++i)
- kdb_printf("%s", s->command[i]);
- kdb_printf("endefcmd\n");
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (kp->func == kdb_exec_defcmd) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n",
+ kp->name, kp->usage, kp->help);
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements,
+ list_node)
+ kdb_printf("%s", kms->statement);
+ kdb_printf("endefcmd\n");
+ }
}
return 0;
}
@@ -727,45 +729,43 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set)
+ kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB);
+ if (!kdb_macro)
goto fail_defcmd;
- memcpy(defcmd_set, save_defcmd_set,
- defcmd_set_count * sizeof(*defcmd_set));
- s = defcmd_set + defcmd_set_count;
- memset(s, 0, sizeof(*s));
- s->usable = true;
- s->name = kdb_strdup(argv[1], GFP_KDB);
- if (!s->name)
+
+ mp = &kdb_macro->cmd;
+ mp->func = kdb_exec_defcmd;
+ mp->minlen = 0;
+ mp->flags = KDB_ENABLE_ALWAYS_SAFE;
+ mp->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!mp->name)
goto fail_name;
- s->usage = kdb_strdup(argv[2], GFP_KDB);
- if (!s->usage)
+ mp->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!mp->usage)
goto fail_usage;
- s->help = kdb_strdup(argv[3], GFP_KDB);
- if (!s->help)
+ mp->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!mp->help)
goto fail_help;
- if (s->usage[0] == '"') {
- strcpy(s->usage, argv[2]+1);
- s->usage[strlen(s->usage)-1] = '\0';
+ if (mp->usage[0] == '"') {
+ strcpy(mp->usage, argv[2]+1);
+ mp->usage[strlen(mp->usage)-1] = '\0';
}
- if (s->help[0] == '"') {
- strcpy(s->help, argv[3]+1);
- s->help[strlen(s->help)-1] = '\0';
+ if (mp->help[0] == '"') {
+ strcpy(mp->help, argv[3]+1);
+ mp->help[strlen(mp->help)-1] = '\0';
}
- ++defcmd_set_count;
+
+ INIT_LIST_HEAD(&kdb_macro->statements);
defcmd_in_progress = true;
- kfree(save_defcmd_set);
return 0;
fail_help:
- kfree(s->usage);
+ kfree(mp->usage);
fail_usage:
- kfree(s->name);
+ kfree(mp->name);
fail_name:
- kfree(defcmd_set);
+ kfree(kdb_macro);
fail_defcmd:
- kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
- defcmd_set = save_defcmd_set;
+ kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]);
return KDB_NOTIMP;
}
@@ -780,25 +780,31 @@ fail_defcmd:
*/
static int kdb_exec_defcmd(int argc, const char **argv)
{
- int i, ret;
- struct defcmd_set *s;
+ int ret;
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
if (argc != 0)
return KDB_ARGCOUNT;
- for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
- if (strcmp(s->name, argv[0]) == 0)
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, argv[0]) == 0)
break;
}
- if (i == defcmd_set_count) {
+ if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) {
kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
argv[0]);
return KDB_NOTIMP;
}
- for (i = 0; i < s->count; ++i) {
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements, list_node) {
+ /*
+ * Recursive use of kdb_parse, do not use argv after this point.
+ */
argv = NULL;
- kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
- ret = kdb_parse(s->command[i]);
+ kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement);
+ ret = kdb_parse(kms->statement);
if (ret)
return ret;
}
@@ -894,7 +900,7 @@ static void parse_grep(const char *str)
* Limited to 20 tokens.
*
* Real rudimentary tokenization. Basically only whitespace
- * is considered a token delimeter (but special consideration
+ * is considered a token delimiter (but special consideration
* is taken of the '=' sign as used by the 'set' command).
*
* The algorithm used to tokenize the input string relies on
@@ -1009,11 +1015,11 @@ int kdb_parse(const char *cmdstr)
* If this command is allowed to be abbreviated,
* check to see if this is it.
*/
- if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) &&
- (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0))
+ if (tp->minlen && (strlen(argv[0]) <= tp->minlen) &&
+ (strncmp(argv[0], tp->name, tp->minlen) == 0))
break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
+ if (strcmp(argv[0], tp->name) == 0)
break;
}
@@ -1024,8 +1030,7 @@ int kdb_parse(const char *cmdstr)
*/
if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
list_for_each_entry(tp, &kdb_cmds_head, list_node) {
- if (strncmp(argv[0], tp->cmd_name,
- strlen(tp->cmd_name)) == 0)
+ if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0)
break;
}
}
@@ -1033,19 +1038,19 @@ int kdb_parse(const char *cmdstr)
if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
- if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1))
return KDB_NOPERM;
KDB_STATE_SET(CMD);
- result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ result = (*tp->func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ if (tp->flags & KDB_REPEAT_WITH_ARGS)
return result;
- argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
if (argv[argc])
*(argv[argc]) = '\0';
return result;
@@ -2412,12 +2417,12 @@ static int kdb_help(int argc, const char **argv)
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true))
continue;
- if (strlen(kt->cmd_usage) > 20)
+ if (strlen(kt->usage) > 20)
space = "\n ";
- kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
- kt->cmd_usage, space, kt->cmd_help);
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->name,
+ kt->usage, space, kt->help);
}
return 0;
}
@@ -2488,7 +2493,6 @@ static void kdb_sysinfo(struct sysinfo *val)
static int kdb_summary(int argc, const char **argv)
{
time64_t now;
- struct tm tm;
struct sysinfo val;
if (argc)
@@ -2502,13 +2506,7 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
now = __ktime_get_real_seconds();
- time64_to_tm(now, 0, &tm);
- kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d "
- "tz_minuteswest %d\n",
- 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
- tm.tm_hour, tm.tm_min, tm.tm_sec,
- sys_tz.tz_minuteswest);
-
+ kdb_printf("date %ptTs tz_minuteswest %d\n", &now, sys_tz.tz_minuteswest);
kdb_sysinfo(&val);
kdb_printf("uptime ");
if (val.uptime > (24*60*60)) {
@@ -2620,56 +2618,32 @@ static int kdb_grep_help(int argc, const char **argv)
return 0;
}
-/*
- * kdb_register_flags - This function is used to register a kernel
- * debugger command.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * repeat Does the command auto repeat on enter?
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register() - This function is used to register a kernel debugger
+ * command.
+ * @cmd: pointer to kdb command
+ *
+ * Note that it's the job of the caller to keep the memory for the cmd
+ * allocated until unregister is called.
*/
-int kdb_register_flags(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_cmdflags_t flags)
+int kdb_register(kdbtab_t *cmd)
{
kdbtab_t *kp;
list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- kdb_printf("Duplicate kdb command registered: "
- "%s, func %px help %s\n", cmd, func, help);
+ if (strcmp(kp->name, cmd->name) == 0) {
+ kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n",
+ cmd->name, cmd->func, cmd->help);
return 1;
}
}
- kp = kmalloc(sizeof(*kp), GFP_KDB);
- if (!kp) {
- kdb_printf("Could not allocate new kdb_command table\n");
- return 1;
- }
-
- kp->cmd_name = cmd;
- kp->cmd_func = func;
- kp->cmd_usage = usage;
- kp->cmd_help = help;
- kp->cmd_minlen = minlen;
- kp->cmd_flags = flags;
- kp->is_dynamic = true;
-
- list_add_tail(&kp->list_node, &kdb_cmds_head);
-
+ list_add_tail(&cmd->list_node, &kdb_cmds_head);
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_flags);
+EXPORT_SYMBOL_GPL(kdb_register);
-/*
+/**
* kdb_register_table() - This function is used to register a kdb command
* table.
* @kp: pointer to kdb command table
@@ -2683,266 +2657,231 @@ void kdb_register_table(kdbtab_t *kp, size_t len)
}
}
-/*
- * kdb_register - Compatibility register function for commands that do
- * not need to specify a repeat state. Equivalent to
- * kdb_register_flags with flags set to 0.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * Returns:
- * zero for success, one if a duplicate command.
- */
-int kdb_register(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen)
-{
- return kdb_register_flags(cmd, func, usage, help, minlen, 0);
-}
-EXPORT_SYMBOL_GPL(kdb_register);
-
-/*
- * kdb_unregister - This function is used to unregister a kernel
- * debugger command. It is generally called when a module which
- * implements kdb commands is unloaded.
- * Inputs:
- * cmd Command name
- * Returns:
- * zero for success, one command not registered.
+/**
+ * kdb_unregister() - This function is used to unregister a kernel debugger
+ * command. It is generally called when a module which
+ * implements kdb command is unloaded.
+ * @cmd: pointer to kdb command
*/
-int kdb_unregister(char *cmd)
+void kdb_unregister(kdbtab_t *cmd)
{
- kdbtab_t *kp;
-
- /*
- * find the command.
- */
- list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- list_del(&kp->list_node);
- if (kp->is_dynamic)
- kfree(kp);
- return 0;
- }
- }
-
- /* Couldn't find it. */
- return 1;
+ list_del(&cmd->list_node);
}
EXPORT_SYMBOL_GPL(kdb_unregister);
static kdbtab_t maintab[] = {
- { .cmd_name = "md",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "md",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdr",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr> <bytes>",
- .cmd_help = "Display Raw Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdr",
+ .func = kdb_md,
+ .usage = "<vaddr> <bytes>",
+ .help = "Display Raw Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdp",
- .cmd_func = kdb_md,
- .cmd_usage = "<paddr> <bytes>",
- .cmd_help = "Display Physical Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdp",
+ .func = kdb_md,
+ .usage = "<paddr> <bytes>",
+ .help = "Display Physical Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mds",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Symbolically",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mds",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Symbolically",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mm",
- .cmd_func = kdb_mm,
- .cmd_usage = "<vaddr> <contents>",
- .cmd_help = "Modify Memory Contents",
- .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ { .name = "mm",
+ .func = kdb_mm,
+ .usage = "<vaddr> <contents>",
+ .help = "Modify Memory Contents",
+ .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "go",
- .cmd_func = kdb_go,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Continue Execution",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_REG_WRITE |
+ { .name = "go",
+ .func = kdb_go,
+ .usage = "[<vaddr>]",
+ .help = "Continue Execution",
+ .minlen = 1,
+ .flags = KDB_ENABLE_REG_WRITE |
KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "rd",
- .cmd_func = kdb_rd,
- .cmd_usage = "",
- .cmd_help = "Display Registers",
- .cmd_flags = KDB_ENABLE_REG_READ,
+ { .name = "rd",
+ .func = kdb_rd,
+ .usage = "",
+ .help = "Display Registers",
+ .flags = KDB_ENABLE_REG_READ,
},
- { .cmd_name = "rm",
- .cmd_func = kdb_rm,
- .cmd_usage = "<reg> <contents>",
- .cmd_help = "Modify Registers",
- .cmd_flags = KDB_ENABLE_REG_WRITE,
+ { .name = "rm",
+ .func = kdb_rm,
+ .usage = "<reg> <contents>",
+ .help = "Modify Registers",
+ .flags = KDB_ENABLE_REG_WRITE,
},
- { .cmd_name = "ef",
- .cmd_func = kdb_ef,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display exception frame",
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "ef",
+ .func = kdb_ef,
+ .usage = "<vaddr>",
+ .help = "Display exception frame",
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "bt",
- .cmd_func = kdb_bt,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Stack traceback",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "bt",
+ .func = kdb_bt,
+ .usage = "[<vaddr>]",
+ .help = "Stack traceback",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "btp",
- .cmd_func = kdb_bt,
- .cmd_usage = "<pid>",
- .cmd_help = "Display stack for process <pid>",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btp",
+ .func = kdb_bt,
+ .usage = "<pid>",
+ .help = "Display stack for process <pid>",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "bta",
- .cmd_func = kdb_bt,
- .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
- .cmd_help = "Backtrace all processes matching state flag",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "bta",
+ .func = kdb_bt,
+ .usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
+ .help = "Backtrace all processes matching state flag",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btc",
- .cmd_func = kdb_bt,
- .cmd_usage = "",
- .cmd_help = "Backtrace current process on each cpu",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btc",
+ .func = kdb_bt,
+ .usage = "",
+ .help = "Backtrace current process on each cpu",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btt",
- .cmd_func = kdb_bt,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Backtrace process given its struct task address",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "btt",
+ .func = kdb_bt,
+ .usage = "<vaddr>",
+ .help = "Backtrace process given its struct task address",
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "env",
- .cmd_func = kdb_env,
- .cmd_usage = "",
- .cmd_help = "Show environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "env",
+ .func = kdb_env,
+ .usage = "",
+ .help = "Show environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "set",
- .cmd_func = kdb_set,
- .cmd_usage = "",
- .cmd_help = "Set environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "set",
+ .func = kdb_set,
+ .usage = "",
+ .help = "Set environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "help",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "help",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .minlen = 1,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "?",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "?",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "cpu",
- .cmd_func = kdb_cpu,
- .cmd_usage = "<cpunum>",
- .cmd_help = "Switch to new cpu",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ { .name = "cpu",
+ .func = kdb_cpu,
+ .usage = "<cpunum>",
+ .help = "Switch to new cpu",
+ .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "kgdb",
- .cmd_func = kdb_kgdb,
- .cmd_usage = "",
- .cmd_help = "Enter kgdb mode",
- .cmd_flags = 0,
+ { .name = "kgdb",
+ .func = kdb_kgdb,
+ .usage = "",
+ .help = "Enter kgdb mode",
+ .flags = 0,
},
- { .cmd_name = "ps",
- .cmd_func = kdb_ps,
- .cmd_usage = "[<flags>|A]",
- .cmd_help = "Display active task list",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "ps",
+ .func = kdb_ps,
+ .usage = "[<flags>|A]",
+ .help = "Display active task list",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "pid",
- .cmd_func = kdb_pid,
- .cmd_usage = "<pidnum>",
- .cmd_help = "Switch to another task",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "pid",
+ .func = kdb_pid,
+ .usage = "<pidnum>",
+ .help = "Switch to another task",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "reboot",
- .cmd_func = kdb_reboot,
- .cmd_usage = "",
- .cmd_help = "Reboot the machine immediately",
- .cmd_flags = KDB_ENABLE_REBOOT,
+ { .name = "reboot",
+ .func = kdb_reboot,
+ .usage = "",
+ .help = "Reboot the machine immediately",
+ .flags = KDB_ENABLE_REBOOT,
},
#if defined(CONFIG_MODULES)
- { .cmd_name = "lsmod",
- .cmd_func = kdb_lsmod,
- .cmd_usage = "",
- .cmd_help = "List loaded kernel modules",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "lsmod",
+ .func = kdb_lsmod,
+ .usage = "",
+ .help = "List loaded kernel modules",
+ .flags = KDB_ENABLE_INSPECT,
},
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- { .cmd_name = "sr",
- .cmd_func = kdb_sr,
- .cmd_usage = "<key>",
- .cmd_help = "Magic SysRq key",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "sr",
+ .func = kdb_sr,
+ .usage = "<key>",
+ .help = "Magic SysRq key",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
#if defined(CONFIG_PRINTK)
- { .cmd_name = "dmesg",
- .cmd_func = kdb_dmesg,
- .cmd_usage = "[lines]",
- .cmd_help = "Display syslog buffer",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "dmesg",
+ .func = kdb_dmesg,
+ .usage = "[lines]",
+ .help = "Display syslog buffer",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
- { .cmd_name = "defcmd",
- .cmd_func = kdb_defcmd,
- .cmd_usage = "name \"usage\" \"help\"",
- .cmd_help = "Define a set of commands, down to endefcmd",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "defcmd",
+ .func = kdb_defcmd,
+ .usage = "name \"usage\" \"help\"",
+ .help = "Define a set of commands, down to endefcmd",
+ /*
+ * Macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is safety
+ * checked individually.
+ */
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "kill",
- .cmd_func = kdb_kill,
- .cmd_usage = "<-signal> <pid>",
- .cmd_help = "Send a signal to a process",
- .cmd_flags = KDB_ENABLE_SIGNAL,
+ { .name = "kill",
+ .func = kdb_kill,
+ .usage = "<-signal> <pid>",
+ .help = "Send a signal to a process",
+ .flags = KDB_ENABLE_SIGNAL,
},
- { .cmd_name = "summary",
- .cmd_func = kdb_summary,
- .cmd_usage = "",
- .cmd_help = "Summarize the system",
- .cmd_minlen = 4,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "summary",
+ .func = kdb_summary,
+ .usage = "",
+ .help = "Summarize the system",
+ .minlen = 4,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "per_cpu",
- .cmd_func = kdb_per_cpu,
- .cmd_usage = "<sym> [<bytes>] [<cpu>]",
- .cmd_help = "Display per_cpu variables",
- .cmd_minlen = 3,
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "per_cpu",
+ .func = kdb_per_cpu,
+ .usage = "<sym> [<bytes>] [<cpu>]",
+ .help = "Display per_cpu variables",
+ .minlen = 3,
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "grephelp",
- .cmd_func = kdb_grep_help,
- .cmd_usage = "",
- .cmd_help = "Display help on | grep",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "grephelp",
+ .func = kdb_grep_help,
+ .usage = "",
+ .help = "Display help on | grep",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
};
static kdbtab_t nmicmd = {
- .cmd_name = "disable_nmi",
- .cmd_func = kdb_disable_nmi,
- .cmd_usage = "",
- .cmd_help = "Disable NMI entry to KDB",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ .name = "disable_nmi",
+ .func = kdb_disable_nmi,
+ .usage = "",
+ .help = "Disable NMI entry to KDB",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
};
/* Initialize the kdb command table. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index ccbed9089808..629590084a0d 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -64,7 +64,7 @@
/*
* KDB_MAXBPT describes the total number of breakpoints
- * supported by this architecure.
+ * supported by this architecture.
*/
#define KDB_MAXBPT 16
@@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
-extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
@@ -165,19 +164,6 @@ typedef struct _kdb_bp {
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
-/* The KDB shell command table */
-typedef struct _kdbtab {
- char *cmd_name; /* Command name */
- kdb_func_t cmd_func; /* Function to execute command */
- char *cmd_usage; /* Usage String for this command */
- char *cmd_help; /* Help message for this command */
- short cmd_minlen; /* Minimum legal # command
- * chars required */
- kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
- struct list_head list_node; /* Command list */
- bool is_dynamic; /* Command table allocation type */
-} kdbtab_t;
-
extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
@@ -233,10 +219,6 @@ extern struct task_struct *kdb_curr_task(int);
#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
-extern void *debug_kmalloc(size_t size, gfp_t flags);
-extern void debug_kfree(void *);
-extern void debug_kusage(void);
-
extern struct task_struct *kdb_current_task;
extern struct pt_regs *kdb_current_regs;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 91bb666d7c03..7507d9a8dc6a 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -10,7 +10,6 @@
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
-#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -52,48 +51,48 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
}
EXPORT_SYMBOL(kdbgetsymval);
-static char *kdb_name_table[100]; /* arbitrary size */
-
-/*
- * kdbnearsym - Return the name of the symbol with the nearest address
- * less than 'addr'.
+/**
+ * kdbnearsym() - Return the name of the symbol with the nearest address
+ * less than @addr.
+ * @addr: Address to check for near symbol
+ * @symtab: Structure to receive results
*
- * Parameters:
- * addr Address to check for symbol near
- * symtab Structure to receive results
- * Returns:
- * 0 No sections contain this address, symtab zero filled
- * 1 Address mapped to module/symbol/section, data in symtab
- * Remarks:
- * 2.6 kallsyms has a "feature" where it unpacks the name into a
- * string. If that string is reused before the caller expects it
- * then the caller sees its string change without warning. To
- * avoid cluttering up the main kdb code with lots of kdb_strdup,
- * tests and kfree calls, kdbnearsym maintains an LRU list of the
- * last few unique strings. The list is sized large enough to
- * hold active strings, no kdb caller of kdbnearsym makes more
- * than ~20 later calls before using a saved value.
+ * WARNING: This function may return a pointer to a single statically
+ * allocated buffer (namebuf). kdb's unusual calling context (single
+ * threaded, all other CPUs halted) provides us sufficient locking for
+ * this to be safe. The only constraint imposed by the static buffer is
+ * that the caller must consume any previous reply prior to another call
+ * to lookup a new symbol.
+ *
+ * Note that, strictly speaking, some architectures may re-enter the kdb
+ * trap if the system turns out to be very badly damaged and this breaks
+ * the single-threaded assumption above. In these circumstances successful
+ * continuation and exit from the inner trap is unlikely to work and any
+ * user attempting this receives a prominent warning before being allowed
+ * to progress. In these circumstances we remain memory safe because
+ * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do
+ * tolerate the possibility of garbled symbol display from the outer kdb
+ * trap.
+ *
+ * Return:
+ * * 0 - No sections contain this address, symtab zero filled
+ * * 1 - Address mapped to module/symbol/section, data in symtab
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
-#define knt1_size 128 /* must be >= kallsyms table size */
- char *knt1 = NULL;
+ static char namebuf[KSYM_NAME_LEN];
kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
- knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
- if (!knt1) {
- kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr);
- goto out;
- }
+
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
- (char **)(&symtab->mod_name), knt1);
+ (char **)(&symtab->mod_name), namebuf);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
@@ -102,63 +101,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
- if (ret) {
- int i;
- /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
- * set but the buffer passed into kallsyms_lookup is not used,
- * so it contains garbage. The caller has to work out which
- * buffer needs to be saved.
- *
- * What was Rusty smoking when he wrote that code?
- */
- if (symtab->sym_name != knt1) {
- strncpy(knt1, symtab->sym_name, knt1_size);
- knt1[knt1_size-1] = '\0';
- }
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i] &&
- strcmp(kdb_name_table[i], knt1) == 0)
- break;
- }
- if (i >= ARRAY_SIZE(kdb_name_table)) {
- debug_kfree(kdb_name_table[0]);
- memmove(kdb_name_table, kdb_name_table+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-1));
- } else {
- debug_kfree(knt1);
- knt1 = kdb_name_table[i];
- memmove(kdb_name_table+i, kdb_name_table+i+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-i-1));
- }
- i = ARRAY_SIZE(kdb_name_table) - 1;
- kdb_name_table[i] = knt1;
- symtab->sym_name = kdb_name_table[i];
- knt1 = NULL;
- }
-
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
-
out:
- debug_kfree(knt1);
return ret;
}
-void kdbnearsym_cleanup(void)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i]) {
- debug_kfree(kdb_name_table[i]);
- kdb_name_table[i] = NULL;
- }
- }
-}
-
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
@@ -609,23 +559,25 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- int cpu;
- char state;
+ unsigned int p_state;
unsigned long tmp;
+ char state;
+ int cpu;
if (!p ||
copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
cpu = kdb_process_cpu(p);
- state = (p->state == 0) ? 'R' :
- (p->state < 0) ? 'U' :
- (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p->state & TASK_STOPPED) ? 'T' :
- (p->state & TASK_TRACED) ? 'C' :
+ p_state = READ_ONCE(p->__state);
+ state = (p_state == 0) ? 'R' :
+ (p_state < 0) ? 'U' :
+ (p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
+ (p_state & TASK_STOPPED) ? 'T' :
+ (p_state & TASK_TRACED) ? 'C' :
(p->exit_state & EXIT_ZOMBIE) ? 'Z' :
(p->exit_state & EXIT_DEAD) ? 'E' :
- (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ (p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
@@ -654,230 +606,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
return (mask & kdb_task_state_string(state)) != 0;
}
-/* Last ditch allocator for debugging, so we can still debug even when
- * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
- * for space usage, not for speed. One smallish memory pool, the free
- * chain is always in ascending address order to allow coalescing,
- * allocations are done in brute force best fit.
- */
-
-struct debug_alloc_header {
- u32 next; /* offset of next header from start of pool */
- u32 size;
- void *caller;
-};
-
-/* The memory returned by this allocator must be aligned, which means
- * so must the header size. Do not assume that sizeof(struct
- * debug_alloc_header) is a multiple of the alignment, explicitly
- * calculate the overhead of this header, including the alignment.
- * The rest of this code must not use sizeof() on any header or
- * pointer to a header.
- */
-#define dah_align 8
-#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
-
-static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
-static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
-static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
-
-/* Locking is awkward. The debug code is called from all contexts,
- * including non maskable interrupts. A normal spinlock is not safe
- * in NMI context. Try to get the debug allocator lock, if it cannot
- * be obtained after a second then give up. If the lock could not be
- * previously obtained on this cpu then only try once.
- *
- * sparse has no annotation for "this function _sometimes_ acquires a
- * lock", so fudge the acquire/release notation.
- */
-static DEFINE_SPINLOCK(dap_lock);
-static int get_dap_lock(void)
- __acquires(dap_lock)
-{
- static int dap_locked = -1;
- int count;
- if (dap_locked == smp_processor_id())
- count = 1;
- else
- count = 1000;
- while (1) {
- if (spin_trylock(&dap_lock)) {
- dap_locked = -1;
- return 1;
- }
- if (!count--)
- break;
- udelay(1000);
- }
- dap_locked = smp_processor_id();
- __acquire(dap_lock);
- return 0;
-}
-
-void *debug_kmalloc(size_t size, gfp_t flags)
-{
- unsigned int rem, h_offset;
- struct debug_alloc_header *best, *bestprev, *prev, *h;
- void *p = NULL;
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return NULL;
- }
- h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first_call) {
- h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
- dah_first_call = 0;
- }
- size = ALIGN(size, dah_align);
- prev = best = bestprev = NULL;
- while (1) {
- if (h->size >= size && (!best || h->size < best->size)) {
- best = h;
- bestprev = prev;
- if (h->size == size)
- break;
- }
- if (!h->next)
- break;
- prev = h;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
- }
- if (!best)
- goto out;
- rem = best->size - size;
- /* The pool must always contain at least one header */
- if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
- goto out;
- if (rem >= dah_overhead) {
- best->size = size;
- h_offset = ((char *)best - debug_alloc_pool) +
- dah_overhead + best->size;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
- h->size = rem - dah_overhead;
- h->next = best->next;
- } else
- h_offset = best->next;
- best->caller = __builtin_return_address(0);
- dah_used += best->size;
- dah_used_max = max(dah_used, dah_used_max);
- if (bestprev)
- bestprev->next = h_offset;
- else
- dah_first = h_offset;
- p = (char *)best + dah_overhead;
- memset(p, POISON_INUSE, best->size - 1);
- *((char *)p + best->size - 1) = POISON_END;
-out:
- spin_unlock(&dap_lock);
- return p;
-}
-
-void debug_kfree(void *p)
-{
- struct debug_alloc_header *h;
- unsigned int h_offset;
- if (!p)
- return;
- if ((char *)p < debug_alloc_pool ||
- (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
- kfree(p);
- return;
- }
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return; /* memory leak, cannot be helped */
- }
- h = (struct debug_alloc_header *)((char *)p - dah_overhead);
- memset(p, POISON_FREE, h->size - 1);
- *((char *)p + h->size - 1) = POISON_END;
- h->caller = NULL;
- dah_used -= h->size;
- h_offset = (char *)h - debug_alloc_pool;
- if (h_offset < dah_first) {
- h->next = dah_first;
- dah_first = h_offset;
- } else {
- struct debug_alloc_header *prev;
- unsigned int prev_offset;
- prev = (struct debug_alloc_header *)(debug_alloc_pool +
- dah_first);
- while (1) {
- if (!prev->next || prev->next > h_offset)
- break;
- prev = (struct debug_alloc_header *)
- (debug_alloc_pool + prev->next);
- }
- prev_offset = (char *)prev - debug_alloc_pool;
- if (prev_offset + dah_overhead + prev->size == h_offset) {
- prev->size += dah_overhead + h->size;
- memset(h, POISON_FREE, dah_overhead - 1);
- *((char *)h + dah_overhead - 1) = POISON_END;
- h = prev;
- h_offset = prev_offset;
- } else {
- h->next = prev->next;
- prev->next = h_offset;
- }
- }
- if (h_offset + dah_overhead + h->size == h->next) {
- struct debug_alloc_header *next;
- next = (struct debug_alloc_header *)
- (debug_alloc_pool + h->next);
- h->size += dah_overhead + next->size;
- h->next = next->next;
- memset(next, POISON_FREE, dah_overhead - 1);
- *((char *)next + dah_overhead - 1) = POISON_END;
- }
- spin_unlock(&dap_lock);
-}
-
-void debug_kusage(void)
-{
- struct debug_alloc_header *h_free, *h_used;
-#ifdef CONFIG_IA64
- /* FIXME: using dah for ia64 unwind always results in a memory leak.
- * Fix that memory leak first, then set debug_kusage_one_time = 1 for
- * all architectures.
- */
- static int debug_kusage_one_time;
-#else
- static int debug_kusage_one_time = 1;
-#endif
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return;
- }
- h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first == 0 &&
- (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
- dah_first_call))
- goto out;
- if (!debug_kusage_one_time)
- goto out;
- debug_kusage_one_time = 0;
- kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first);
- if (dah_first) {
- h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_func_printf("h_used %px size %d\n", h_used, h_used->size);
- }
- do {
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
- h_free = (struct debug_alloc_header *)
- (debug_alloc_pool + h_free->next);
- } while (h_free->next);
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- if ((char *)h_used - debug_alloc_pool !=
- sizeof(debug_alloc_pool_aligned))
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
-out:
- spin_unlock(&dap_lock);
-}
-
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
* the global kdb state.
*/
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 27725754ac99..51530d5b15a8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -7,29 +7,63 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
-#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>
-int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
-EXPORT_SYMBOL_GPL(delayacct_on);
+DEFINE_STATIC_KEY_FALSE(delayacct_key);
+int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
-static int __init delayacct_setup_disable(char *str)
+static void set_delayacct(bool enabled)
{
- delayacct_on = 0;
+ if (enabled) {
+ static_branch_enable(&delayacct_key);
+ delayacct_on = 1;
+ } else {
+ delayacct_on = 0;
+ static_branch_disable(&delayacct_key);
+ }
+}
+
+static int __init delayacct_setup_enable(char *str)
+{
+ delayacct_on = 1;
return 1;
}
-__setup("nodelayacct", delayacct_setup_disable);
+__setup("delayacct", delayacct_setup_enable);
void delayacct_init(void)
{
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
+ set_delayacct(delayacct_on);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int state = delayacct_on;
+ struct ctl_table t;
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_delayacct(state);
+ return err;
}
+#endif
void __delayacct_tsk_init(struct task_struct *tsk)
{
@@ -42,10 +76,9 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
- u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
- s64 ns = ktime_get_ns() - *start;
+ s64 ns = local_clock() - *start;
unsigned long flags;
if (ns > 0) {
@@ -58,7 +91,7 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
void __delayacct_blkio_start(void)
{
- current->delays->blkio_start = ktime_get_ns();
+ current->delays->blkio_start = local_clock();
}
/*
@@ -82,7 +115,7 @@ void __delayacct_blkio_end(struct task_struct *p)
delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
-int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
@@ -117,6 +150,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+ if (!tsk->delays)
+ return 0;
+
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
@@ -151,21 +187,20 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
void __delayacct_freepages_start(void)
{
- current->delays->freepages_start = ktime_get_ns();
+ current->delays->freepages_start = local_clock();
}
void __delayacct_freepages_end(void)
{
- delayacct_end(
- &current->delays->lock,
- &current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(&current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
void __delayacct_thrashing_start(void)
{
- current->delays->thrashing_start = ktime_get_ns();
+ current->delays->thrashing_start = local_clock();
}
void __delayacct_thrashing_end(void)
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..1b02179758cb 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,19 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config DMA_RESTRICTED_POOL
+ bool "DMA Restricted Pool"
+ depends on OF && OF_RESERVED_MEM && SWIOTLB
+ help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+ and <kernel/dma/swiotlb.c>.
+ If unsure, say "n".
+
#
# Should be selected if we can mmap non-coherent mappings to userspace.
# The only thing that is really required is a way to set an uncached bit
@@ -93,6 +106,10 @@ config DMA_COHERENT_POOL
select GENERIC_ALLOCATOR
bool
+config DMA_GLOBAL_POOL
+ select DMA_DECLARE_COHERENT
+ bool
+
config DMA_REMAP
bool
depends on MMU
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 5b5b6c7ec7f2..25fc85a7aebe 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -20,8 +20,6 @@ struct dma_coherent_mem {
bool use_dev_dma_pfn_offset;
};
-static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
-
static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
{
if (dev && dev->dma_mem)
@@ -37,51 +35,44 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
return mem->device_base;
}
-static int dma_init_coherent_memory(phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size,
- struct dma_coherent_mem **mem)
+static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
{
- struct dma_coherent_mem *dma_mem = NULL;
- void *mem_base = NULL;
+ struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
- int ret;
+ void *mem_base;
- if (!size) {
- ret = -EINVAL;
- goto out;
- }
+ if (!size)
+ return ERR_PTR(-EINVAL);
mem_base = memremap(phys_addr, size, MEMREMAP_WC);
- if (!mem_base) {
- ret = -EINVAL;
- goto out;
- }
+ if (!mem_base)
+ return ERR_PTR(-EINVAL);
+
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dma_mem) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!dma_mem)
+ goto out_unmap_membase;
dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dma_mem->bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!dma_mem->bitmap)
+ goto out_free_dma_mem;
dma_mem->virt_base = mem_base;
dma_mem->device_base = device_addr;
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
+ dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
spin_lock_init(&dma_mem->spinlock);
- *mem = dma_mem;
- return 0;
+ return dma_mem;
-out:
+out_free_dma_mem:
kfree(dma_mem);
- if (mem_base)
- memunmap(mem_base);
- return ret;
+out_unmap_membase:
+ memunmap(mem_base);
+ pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n",
+ &phys_addr, size / SZ_1M);
+ return ERR_PTR(-ENOMEM);
}
static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
@@ -111,7 +102,7 @@ static int dma_assign_coherent_memory(struct device *dev,
* Declare a region of memory to be handed out by dma_alloc_coherent() when it
* is asked for coherent memory for this device. This shall only be used
* from platform code, usually based on the device tree description.
- *
+ *
* phys_addr is the CPU physical address to which the memory is currently
* assigned (this will be ioremapped so the CPU can access the region).
*
@@ -130,9 +121,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
struct dma_coherent_mem *mem;
int ret;
- ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
- if (ret)
- return ret;
+ mem = dma_init_coherent_memory(phys_addr, device_addr, size, false);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
@@ -198,16 +189,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
return 1;
}
-void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle)
-{
- if (!dma_coherent_default_memory)
- return NULL;
-
- return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
- dma_handle);
-}
-
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
int order, void *vaddr)
{
@@ -243,15 +224,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
return __dma_release_from_coherent(mem, order, vaddr);
}
-int dma_release_from_global_coherent(int order, void *vaddr)
-{
- if (!dma_coherent_default_memory)
- return 0;
-
- return __dma_release_from_coherent(dma_coherent_default_memory, order,
- vaddr);
-}
-
static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
{
@@ -297,6 +269,28 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
+
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
+{
+ if (!dma_coherent_default_memory)
+ return NULL;
+
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
+}
+
+int dma_release_from_global_coherent(int order, void *vaddr)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_release_from_coherent(dma_coherent_default_memory, order,
+ vaddr);
+}
+
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
{
@@ -307,6 +301,19 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
vaddr, size, ret);
}
+int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
+{
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ dma_coherent_default_memory = mem;
+ pr_info("DMA: default coherent area is set\n");
+ return 0;
+}
+#endif /* CONFIG_DMA_GLOBAL_POOL */
+
/*
* Support for reserved memory regions defined in device tree
*/
@@ -315,25 +322,22 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
+#ifdef CONFIG_DMA_GLOBAL_POOL
static struct reserved_mem *dma_reserved_default_memory __initdata;
+#endif
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- struct dma_coherent_mem *mem = rmem->priv;
- int ret;
+ if (!rmem->priv) {
+ struct dma_coherent_mem *mem;
- if (!mem) {
- ret = dma_init_coherent_memory(rmem->base, rmem->base,
- rmem->size, &mem);
- if (ret) {
- pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
- &rmem->base, (unsigned long)rmem->size / SZ_1M);
- return ret;
- }
+ mem = dma_init_coherent_memory(rmem->base, rmem->base,
+ rmem->size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ rmem->priv = mem;
}
- mem->use_dev_dma_pfn_offset = true;
- rmem->priv = mem;
- dma_assign_coherent_memory(dev, mem);
+ dma_assign_coherent_memory(dev, rmem->priv);
return 0;
}
@@ -361,7 +365,9 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: regions without no-map are not yet supported\n");
return -EINVAL;
}
+#endif
+#ifdef CONFIG_DMA_GLOBAL_POOL
if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
WARN(dma_reserved_default_memory,
"Reserved memory: region for default DMA coherent area is redefined\n");
@@ -375,31 +381,16 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
return 0;
}
+#ifdef CONFIG_DMA_GLOBAL_POOL
static int __init dma_init_reserved_memory(void)
{
- const struct reserved_mem_ops *ops;
- int ret;
-
if (!dma_reserved_default_memory)
return -ENOMEM;
-
- ops = dma_reserved_default_memory->ops;
-
- /*
- * We rely on rmem_dma_device_init() does not propagate error of
- * dma_assign_coherent_memory() for "NULL" device.
- */
- ret = ops->device_init(dma_reserved_default_memory, NULL);
-
- if (!ret) {
- dma_coherent_default_memory = dma_reserved_default_memory->priv;
- pr_info("DMA: default coherent area is set\n");
- }
-
- return ret;
+ return dma_init_global_coherent(dma_reserved_default_memory->base,
+ dma_reserved_default_memory->size);
}
-
core_initcall(dma_init_reserved_memory);
+#endif /* CONFIG_DMA_GLOBAL_POOL */
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
#endif
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 14de1271463f..7a14ca29c377 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -552,7 +552,7 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
-static void add_dma_entry(struct dma_debug_entry *entry)
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
struct hash_bucket *bucket;
unsigned long flags;
@@ -566,11 +566,10 @@ static void add_dma_entry(struct dma_debug_entry *entry)
if (rc == -ENOMEM) {
pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
-
- /* TODO: report -EEXIST errors here as overlapping mappings are
- * not supported by the DMA API
- */
}
static int dma_debug_create_entries(gfp_t gfp)
@@ -794,7 +793,7 @@ static int dump_show(struct seq_file *seq, void *v)
}
DEFINE_SHOW_ATTRIBUTE(dump);
-static void dma_debug_fs_init(void)
+static int __init dma_debug_fs_init(void)
{
struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
@@ -807,7 +806,10 @@ static void dma_debug_fs_init(void)
debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
+
+ return 0;
}
+core_initcall_sync(dma_debug_fs_init);
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
{
@@ -892,8 +894,6 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- dma_debug_fs_init();
-
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
dma_debug_create_entries(GFP_KERNEL);
@@ -1066,20 +1066,10 @@ static void check_for_stack(struct device *dev,
}
}
-static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
-{
- unsigned long a1 = (unsigned long)addr;
- unsigned long b1 = a1 + len;
- unsigned long a2 = (unsigned long)start;
- unsigned long b2 = (unsigned long)end;
-
- return !(b1 <= a2 || a1 >= b2);
-}
-
static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
{
- if (overlap(addr, len, _stext, _etext) ||
- overlap(addr, len, __start_rodata, __end_rodata))
+ if (memory_intersects(_stext, _etext, addr, len) ||
+ memory_intersects(__start_rodata, __end_rodata, addr, len))
err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
}
@@ -1201,7 +1191,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr)
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1232,7 +1223,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
check_for_illegal_area(dev, addr, size);
}
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -1290,7 +1281,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
struct scatterlist *s;
@@ -1299,6 +1291,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (unlikely(dma_debug_disabled()))
return;
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_page(s), s->offset);
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
for_each_sg(sg, s, mapped_ents, i) {
entry = dma_entry_alloc();
if (!entry)
@@ -1314,15 +1312,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- check_for_stack(dev, sg_page(s), s->offset);
-
- if (!PageHighMem(sg_page(s))) {
- check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
- }
-
check_sg_segment(dev, s);
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
}
@@ -1378,7 +1370,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1408,7 +1401,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
else
entry->pfn = page_to_pfn(virt_to_page(virt));
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
@@ -1439,7 +1432,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1459,7 +1453,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index 83643b3010b2..f525197d3cae 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -11,26 +11,30 @@
#ifdef CONFIG_DMA_API_DEBUG
extern void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr);
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction);
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr);
+ dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction);
@@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
@@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
}
@@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
}
@@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..4c6c5e0635e3 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+ size_t size)
+{
+ if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+ swiotlb_free(dev, page, size))
+ return;
+ dma_free_contiguous(dev, page, size);
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
+ if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+ is_swiotlb_for_alloc(dev)) {
+ page = swiotlb_alloc(dev, size);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ __dma_direct_free_pages(dev, page, size);
+ return NULL;
+ }
+ return page;
+ }
+
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
@@ -156,17 +175,28 @@ void *dma_direct_alloc(struct device *dev, size_t size,
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev))
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev))
+ return dma_alloc_from_global_coherent(dev, size, dma_handle);
+
/*
* Remapping or decrypting memory may block. If either is required and
* we can't block, allocate the memory from the atomic pools.
+ * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
+ * set up another device coherent pool by shared-dma-pool and use
+ * dma_alloc_from_dev_coherent instead.
*/
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
!gfpflags_allow_blocking(gfp) &&
(force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+ (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !dev_is_dma_coherent(dev))) &&
+ !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
@@ -237,7 +267,7 @@ out_encrypt_pages:
return NULL;
}
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -247,7 +277,7 @@ void dma_direct_free(struct device *dev, size_t size,
unsigned int page_order = get_order(size);
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev)) {
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
@@ -255,11 +285,20 @@ void dma_direct_free(struct device *dev, size_t size,
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) {
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
return;
}
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev)) {
+ if (!dma_release_from_global_coherent(page_order, cpu_addr))
+ WARN_ON_ONCE(1);
+ return;
+ }
+
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
@@ -273,7 +312,7 @@ void dma_direct_free(struct device *dev, size_t size,
else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+ __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -283,7 +322,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
void *ret;
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+ force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
+ !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -310,7 +350,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
out_free_pages:
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
return NULL;
}
@@ -329,7 +369,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
- dma_free_contiguous(dev, page, size);
+ __dma_direct_free_pages(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
@@ -343,7 +383,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, sg->length,
dir);
@@ -369,7 +409,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
dir);
@@ -411,7 +451,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return 0;
+ return -EIO;
}
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
@@ -462,6 +502,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
+ if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret))
+ return ret;
if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
return -ENXIO;
@@ -495,8 +537,8 @@ int dma_direct_supported(struct device *dev, u64 mask)
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active() &&
- (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_active(dev) &&
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
}
@@ -504,7 +546,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
+ is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
}
/**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 50afc05b6f1d..4632b0f4f72e 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
@@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(paddr)))
+ if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
@@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+ if (is_swiotlb_force_bounce(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
@@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(phys)))
+ if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index eacd4c5b10bf..b492d59ac77e 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
{
- return 0;
+ return -EINVAL;
}
static int dma_dummy_supported(struct device *hwdev, u64 mask)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 2b06a809d0b9..8349a9f2c345 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -156,7 +156,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr);
+ debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
}
@@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
}
EXPORT_SYMBOL(dma_unmap_page_attrs);
-/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
- */
-int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, unsigned long attrs)
+static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
int ents;
@@ -197,13 +193,82 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- BUG_ON(ents < 0);
- debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+ if (ents > 0)
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
+ else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO))
+ return -EIO;
return ents;
}
+
+/**
+ * dma_map_sg_attrs - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist passed in the sg argument with
+ * nents segments for the @dir DMA operation by the @dev device.
+ *
+ * Returns the number of mapped entries (which can be less than nents)
+ * on success. Zero is returned for any error.
+ *
+ * dma_unmap_sg_attrs() should be used to unmap the buffer with the
+ * original sg and original nents (not the value returned by this funciton).
+ */
+unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ int ret;
+
+ ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);
+ if (ret < 0)
+ return 0;
+ return ret;
+}
EXPORT_SYMBOL(dma_map_sg_attrs);
+/**
+ * dma_map_sgtable - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sgt: The sg_table object describing the buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist stored in the given sg_table
+ * object for the @dir DMA operation by the @dev device. After success, the
+ * ownership for the buffer is transferred to the DMA domain. One has to
+ * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
+ * ownership of the buffer back to the CPU domain before touching the
+ * buffer by the CPU.
+ *
+ * Returns 0 on success or a negative error code on error. The following
+ * error codes are supported with the given meaning:
+ *
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned DMA_MAPPING_ERROR.
+ */
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int nents;
+
+ nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
+ if (nents < 0)
+ return nents;
+ sgt->nents = nents;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dma_map_sgtable);
+
void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir,
unsigned long attrs)
@@ -240,7 +305,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
else if (ops->map_resource)
addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
- debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
return addr;
}
EXPORT_SYMBOL(dma_map_resource);
@@ -445,7 +510,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
else
return NULL;
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
@@ -501,7 +566,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page)
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -579,7 +644,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
}
return sgt;
}
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 910ae69cae77..af4a6ef48ce0 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -5,6 +5,13 @@
*/
#include <linux/dma-map-ops.h>
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -12,7 +19,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret;
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
@@ -32,6 +39,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -43,7 +51,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
+ page_to_pfn(page) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e50df8d8f87e..87c40517e822 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
+#endif
#include <asm/io.h>
#include <asm/dma.h>
@@ -63,7 +70,7 @@
enum swiotlb_force swiotlb_force;
-struct io_tlb_mem *io_tlb_default_mem;
+struct io_tlb_mem io_tlb_default_mem;
/*
* Max segment that we can provide which (if pages are contingous) will
@@ -94,7 +101,7 @@ early_param("swiotlb", setup_io_tlb_npages);
unsigned int swiotlb_max_segment(void)
{
- return io_tlb_default_mem ? max_segment : 0;
+ return io_tlb_default_mem.nslabs ? max_segment : 0;
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
@@ -127,9 +134,9 @@ void __init swiotlb_adjust_size(unsigned long size)
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
- if (!mem) {
+ if (!mem->nslabs) {
pr_warn("No low mem\n");
return;
}
@@ -156,11 +163,11 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
void *vaddr;
unsigned long bytes;
- if (!mem || mem->late_alloc)
+ if (!mem->nslabs || mem->late_alloc)
return;
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
@@ -168,36 +175,50 @@ void __init swiotlb_update_mem_attributes(void)
memset(vaddr, 0, bytes);
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc)
{
+ void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
- size_t alloc_size;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
-
- /* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
- return -ENOMEM;
- alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
- mem = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
mem->nslabs = nslabs;
- mem->start = __pa(tlb);
+ mem->start = start;
mem->end = mem->start + bytes;
mem->index = 0;
+ mem->late_alloc = late_alloc;
+
+ if (swiotlb_force == SWIOTLB_FORCE)
+ mem->force_bounce = true;
+
spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
+ memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ size_t alloc_size;
+
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return 0;
+
+ /* protect against double initialization */
+ if (WARN_ON_ONCE(mem->nslabs))
+ return -ENOMEM;
+
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
+ swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
- io_tlb_default_mem = mem;
if (verbose)
swiotlb_print_info();
swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
@@ -282,37 +303,24 @@ swiotlb_late_init_with_default_size(size_t default_size)
int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
- unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
- struct io_tlb_mem *mem;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long bytes = nslabs << IO_TLB_SHIFT;
if (swiotlb_force == SWIOTLB_NO_FORCE)
return 0;
/* protect against double initialization */
- if (WARN_ON_ONCE(io_tlb_default_mem))
+ if (WARN_ON_ONCE(mem->nslabs))
return -ENOMEM;
- mem = (void *)__get_free_pages(GFP_KERNEL,
- get_order(struct_size(mem, slots, nslabs)));
- if (!mem)
+ mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*mem->slots), nslabs)));
+ if (!mem->slots)
return -ENOMEM;
- mem->nslabs = nslabs;
- mem->start = virt_to_phys(tlb);
- mem->end = mem->start + bytes;
- mem->index = 0;
- mem->late_alloc = 1;
- spin_lock_init(&mem->lock);
- for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
- mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
- mem->slots[i].alloc_size = 0;
- }
-
set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- memset(tlb, 0, bytes);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
- io_tlb_default_mem = mem;
swiotlb_print_info();
swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
@@ -320,18 +328,28 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
- size_t size;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long tbl_vaddr;
+ size_t tbl_size, slots_size;
- if (!mem)
+ if (!mem->nslabs)
return;
- size = struct_size(mem, slots, mem->nslabs);
- if (mem->late_alloc)
- free_pages((unsigned long)mem, get_order(size));
- else
- memblock_free_late(__pa(mem), PAGE_ALIGN(size));
- io_tlb_default_mem = NULL;
+ pr_info("tearing down default memory pool\n");
+ tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
+ tbl_size = PAGE_ALIGN(mem->end - mem->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)mem->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(mem->start, tbl_size);
+ memblock_free_late(__pa(mem->slots), slots_size);
+ }
+
+ memset(mem, 0, sizeof(*mem));
}
/*
@@ -348,19 +366,33 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = phys_to_virt(tlb_addr);
- unsigned int tlb_offset;
+ unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
return;
- tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
- swiotlb_align_offset(dev, orig_addr);
+ tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
+ orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
+ if (tlb_offset < orig_addr_offset) {
+ dev_WARN_ONCE(dev, 1,
+ "Access before mapping start detected. orig offset %u, requested offset %u.\n",
+ orig_addr_offset, tlb_offset);
+ return;
+ }
+
+ tlb_offset -= orig_addr_offset;
+ if (tlb_offset > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
+ alloc_size, size, tlb_offset);
+ return;
+ }
orig_addr += tlb_offset;
alloc_size -= tlb_offset;
@@ -426,10 +458,10 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* Find a suitable number of IO TLB entries size that will fit this request and
* allocate a buffer from that IO TLB pool.
*/
-static int find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size)
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -438,6 +470,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int index, wrap, count = 0, i;
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned long flags;
BUG_ON(!nslots);
@@ -457,8 +490,9 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
do {
- if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
- (orig_addr & iotlb_align_mask)) {
+ if (orig_addr &&
+ (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask)) {
index = wrap_index(mem, index + 1);
continue;
}
@@ -482,8 +516,11 @@ not_found:
return -1;
found:
- for (i = index; i < index + nslots; i++)
+ for (i = index; i < index + nslots; i++) {
mem->slots[i].list = 0;
+ mem->slots[i].alloc_size =
+ alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+ }
for (i = index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
mem->slots[i].list; i--)
@@ -506,7 +543,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int i;
int index;
@@ -524,7 +561,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- index = find_slots(dev, orig_addr, alloc_size + offset);
+ index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -538,11 +575,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+ for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- mem->slots[index + i].alloc_size =
- alloc_size - (i << IO_TLB_SHIFT);
- }
tlb_addr = slot_addr(mem->start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
@@ -550,28 +584,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return tlb_addr;
}
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
- unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
int count, i;
/*
- * First, sync the memory before unmapping the entry
- */
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
- /*
* Return the buffer to the free list by setting the corresponding
* entries to indicate the number of contiguous entries available.
* While returning the entries to the free list, we merge the entries
@@ -605,6 +627,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
spin_unlock_irqrestore(&mem->lock, flags);
}
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+ swiotlb_release_slots(dev, tlb_addr);
+}
+
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
@@ -662,26 +701,155 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
}
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
{
- return io_tlb_default_mem != NULL;
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->nslabs;
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
#ifdef CONFIG_DEBUG_FS
+static struct dentry *debugfs_dir;
-static int __init swiotlb_create_debugfs(void)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
{
- struct io_tlb_mem *mem = io_tlb_default_mem;
-
- if (!mem)
- return 0;
- mem->debugfs = debugfs_create_dir("swiotlb", NULL);
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+ debugfs_dir = debugfs_create_dir("swiotlb", NULL);
+ if (mem->nslabs) {
+ mem->debugfs = debugfs_dir;
+ swiotlb_create_debugfs_files(mem);
+ }
return 0;
}
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
+
+#endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+
+#ifdef CONFIG_DEBUG_FS
+static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+
+ mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
+ swiotlb_create_debugfs_files(mem);
+}
+#else
+static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
+{
+}
#endif
+
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ phys_addr_t tlb_addr;
+ int index;
+
+ if (!mem)
+ return NULL;
+
+ index = swiotlb_find_slots(dev, 0, size);
+ if (index == -1)
+ return NULL;
+
+ tlb_addr = slot_addr(mem->start, index);
+
+ return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+ phys_addr_t tlb_addr = page_to_phys(page);
+
+ if (!is_swiotlb_buffer(dev, tlb_addr))
+ return false;
+
+ swiotlb_release_slots(dev, tlb_addr);
+
+ return true;
+}
+
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+ unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+ /*
+ * Since multiple devices can share the same pool, the private data,
+ * io_tlb_mem struct, will be initialized by the first device attached
+ * to it.
+ */
+ if (!mem) {
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
+ GFP_KERNEL);
+ if (!mem->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+ mem->force_bounce = true;
+ mem->for_alloc = true;
+
+ rmem->priv = mem;
+
+ rmem_swiotlb_debugfs_init(rmem);
+ }
+
+ dev->dma_io_tlb_mem = mem;
+
+ return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+ of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ pr_err("Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
+ rmem->ops = &rmem_swiotlb_ops;
+ pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..d5a61d565ad5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
- if (ti_work & _TIF_NOTIFY_RESUME) {
+ if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fe88d6eea3c2..f23ca260307f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -132,6 +132,7 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
/**
* cpu_function_call - call a function on the cpu
+ * @cpu: target cpu to queue this function
* @func: the function to be called
* @info: the function call argument
*
@@ -3706,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -3724,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ } else {
+ ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
+ group_update_userpage(event);
}
-
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
}
return 0;
@@ -3821,9 +3846,16 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu;
cpuctx = __get_cpu_context(ctx);
+
+ /*
+ * HACK: for HETEROGENEOUS the task context might have switched to a
+ * different PMU, force (re)set the context,
+ */
+ pmu = ctx->pmu = cpuctx->ctx.pmu;
+
if (cpuctx->task_ctx == ctx) {
if (cpuctx->sched_cb_usage)
__perf_pmu_sched_task(cpuctx, true);
@@ -4689,7 +4721,6 @@ errout:
}
static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
@@ -5566,7 +5597,6 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
@@ -5629,7 +5659,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return perf_event_set_filter(event, (void __user *)arg);
case PERF_EVENT_IOC_SET_BPF:
- return perf_event_set_bpf_prog(event, arg);
+ {
+ struct bpf_prog *prog;
+ int err;
+
+ prog = bpf_prog_get(arg);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = perf_event_set_bpf_prog(event, prog, 0);
+ if (err) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+ }
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
struct perf_buffer *rb;
@@ -6303,6 +6348,8 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6669,10 +6716,10 @@ out:
return data->aux_size;
}
-long perf_pmu_snapshot_aux(struct perf_buffer *rb,
- struct perf_event *event,
- struct perf_output_handle *handle,
- unsigned long size)
+static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
{
unsigned long flags;
long ret;
@@ -8299,10 +8346,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
else
flags = MAP_PRIVATE;
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (is_vm_hugetlb_page(vma))
@@ -8682,13 +8725,12 @@ static void perf_event_switch(struct task_struct *task,
},
};
- if (!sched_in && task->state == TASK_RUNNING)
+ if (!sched_in && task->on_rq) {
switch_event.event_id.header.misc |=
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+ }
- perf_iterate_sb(perf_event_switch_output,
- &switch_event,
- NULL);
+ perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}
/*
@@ -9902,13 +9944,16 @@ static void bpf_overflow_handler(struct perf_event *event,
.data = data,
.event = event,
};
+ struct bpf_prog *prog;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
- ret = BPF_PROG_RUN(event->prog, &ctx);
+ prog = READ_ONCE(event->prog);
+ if (prog)
+ ret = bpf_prog_run(prog, &ctx);
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -9918,10 +9963,10 @@ out:
event->orig_overflow_handler(event, data, regs);
}
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- struct bpf_prog *prog;
-
if (event->overflow_handler_context)
/* hw breakpoint or kernel counter */
return -EINVAL;
@@ -9929,9 +9974,8 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
if (event->prog)
return -EEXIST;
- prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
if (event->attr.precise_ip &&
prog->call_get_stack &&
@@ -9947,11 +9991,11 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
* attached to perf_sample_data, do not allow attaching BPF
* program that calls bpf_get_[stack|stackid].
*/
- bpf_prog_put(prog);
return -EPROTO;
}
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
return 0;
@@ -9969,7 +10013,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
bpf_prog_put(prog);
}
#else
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+static int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -EOPNOTSUPP;
}
@@ -9997,14 +10043,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
- struct bpf_prog *prog;
- int ret;
if (!perf_event_is_tracing(event))
- return perf_event_set_bpf_handler(event, prog_fd);
+ return perf_event_set_bpf_handler(event, prog, bpf_cookie);
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
@@ -10013,41 +10058,27 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
- (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
- }
/* Kprobe override only works for kprobes, not uprobes. */
if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
- bpf_prog_put(prog);
+ !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
return -EINVAL;
- }
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
- if (prog->aux->max_ctx_offset > off) {
- bpf_prog_put(prog);
+ if (prog->aux->max_ctx_offset > off)
return -EACCES;
- }
}
- ret = perf_event_attach_bpf_prog(event, prog);
- if (ret)
- bpf_prog_put(prog);
- return ret;
+ return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
@@ -10066,12 +10097,13 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -ENOENT;
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */
@@ -10187,7 +10219,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
return;
if (ifh->nr_file_filters) {
- mm = get_task_mm(event->ctx->task);
+ mm = get_task_mm(task);
if (!mm)
goto restart;
@@ -11912,6 +11944,37 @@ again:
return gctx;
}
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
+
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+ }
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -11919,6 +11982,7 @@ again:
* @pid: target pid
* @cpu: target cpu
* @group_fd: group leader event fd
+ * @flags: perf event open flags
*/
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
@@ -12157,15 +12221,13 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_file;
/*
- * Preserve ptrace permission check for backwards compatibility.
- *
* We must hold exec_update_lock across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perf_check_permission(&attr, task))
goto err_cred;
}
@@ -12375,6 +12437,8 @@ err_fd:
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @overflow_handler: callback to trigger when we hit the event
+ * @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b48d7039a015..f32320ac02fd 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -451,6 +451,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
* register_user_hw_breakpoint - register a hardware breakpoint for user space
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
* @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
struct perf_event *
@@ -550,6 +551,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
* register_wide_hw_breakpoint - register a wide breakpoint in the kernel
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
*
* @return a set of per_cpu pointers to perf events
*/
@@ -566,7 +568,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
if (!cpu_events)
return (void __percpu __force *)ERR_PTR(-ENOMEM);
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
@@ -577,7 +579,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
per_cpu(*cpu_events, cpu) = bp;
}
- put_online_cpus();
+ cpus_read_unlock();
if (likely(!err))
return cpu_events;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6addc9780319..af24dc3febbe 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -453,6 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* that have fixed length instructions.
*
* uprobe_write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -2046,8 +2047,8 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct vm_area_struct *vma;
mmap_read_lock(mm);
- vma = find_vma(mm, bp_vaddr);
- if (vma && vma->vm_start <= bp_vaddr) {
+ vma = vma_lookup(mm, bp_vaddr);
+ if (vma) {
if (valid_vma(vma, false)) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
diff --git a/kernel/exit.c b/kernel/exit.c
index 65809fac3038..91a43e57a32e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -188,7 +188,7 @@ repeat:
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
@@ -777,7 +777,7 @@ void __noreturn do_exit(long code)
schedule();
}
- io_uring_files_cancel(tsk->files);
+ io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
diff --git a/kernel/fork.c b/kernel/fork.c
index a070caed5c8e..38681ad44c76 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -425,7 +425,7 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
static void release_task_stack(struct task_struct *tsk)
{
- if (WARN_ON(tsk->state != TASK_DEAD))
+ if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
account_kernel_stack(tsk, -1);
@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+ release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -470,6 +471,20 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct file *exe_file;
+
+ exe_file = get_mm_exe_file(oldmm);
+ RCU_INIT_POINTER(mm->exe_file, exe_file);
+ /*
+ * We depend on the oldmm having properly denied write access to the
+ * exe_file already.
+ */
+ if (exe_file && deny_write_access(exe_file))
+ pr_warn_once("deny_write_access() failed in %s\n", __func__);
+}
+
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
@@ -493,7 +508,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mm->total_vm = oldmm->total_vm;
mm->data_vm = oldmm->data_vm;
@@ -556,12 +571,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- put_write_access(inode);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -639,7 +651,7 @@ static inline void mm_free_pgd(struct mm_struct *mm)
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
mmap_write_lock(oldmm);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mmap_write_unlock(oldmm);
return 0;
}
@@ -742,6 +754,7 @@ void __put_task_struct(struct task_struct *tsk)
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
+ sched_core_free(tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -824,9 +837,14 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < UCOUNT_COUNTS; i++)
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
+
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
@@ -918,6 +936,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
@@ -1029,7 +1048,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
- atomic_set(&mm->has_pinned, 0);
atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
@@ -1045,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1143,11 +1162,11 @@ void mmput_async(struct mm_struct *mm)
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
* invocations: in mmput() nobody alive left, in execve task is single
- * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
- * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to avoid the need for any locks.
+ * threaded.
+ *
+ * Can only fail if new_exe_file != NULL.
*/
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
@@ -1158,11 +1177,73 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file)
+ if (new_exe_file) {
+ /*
+ * We expect the caller (i.e., sys_execve) to already denied
+ * write access, so this is unlikely to fail.
+ */
+ if (unlikely(deny_write_access(new_exe_file)))
+ return -EACCES;
get_file(new_exe_file);
+ }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file)
+ if (old_exe_file) {
+ allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ }
+ return 0;
+}
+
+/**
+ * replace_mm_exe_file - replace a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
+ * dealing with concurrent invocation and without grabbing the mmap lock in
+ * write mode.
+ *
+ * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ */
+int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct vm_area_struct *vma;
+ struct file *old_exe_file;
+ int ret = 0;
+
+ /* Forbid mm->exe_file change if old file still mapped. */
+ old_exe_file = get_mm_exe_file(mm);
+ if (old_exe_file) {
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &old_exe_file->f_path))
+ ret = -EBUSY;
+ }
+ mmap_read_unlock(mm);
fput(old_exe_file);
+ if (ret)
+ return ret;
+ }
+
+ /* set the new file, lockless */
+ ret = deny_write_access(new_exe_file);
+ if (ret)
+ return -EACCES;
+ get_file(new_exe_file);
+
+ old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ if (old_exe_file) {
+ /*
+ * Don't race with dup_mmap() getting the file and disallowing
+ * write access while someone might open the file writable.
+ */
+ mmap_read_lock(mm);
+ allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ mmap_read_unlock(mm);
+ }
+ return 0;
}
/**
@@ -1182,7 +1263,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
rcu_read_unlock();
return exe_file;
}
-EXPORT_SYMBOL(get_mm_exe_file);
/**
* get_task_exe_file - acquire a reference to the task's executable file
@@ -1205,7 +1285,6 @@ struct file *get_task_exe_file(struct task_struct *task)
task_unlock(task);
return exe_file;
}
-EXPORT_SYMBOL(get_task_exe_file);
/**
* get_task_mm - acquire a reference to the task's mm
@@ -1977,8 +2056,7 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
@@ -1999,7 +2077,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -2079,6 +2157,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(p->bpf_storage, NULL);
+ p->bpf_ctx = NULL;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
@@ -2249,6 +2328,8 @@ static __latent_entropy struct task_struct *copy_process(
klp_copy_process(p);
+ sched_core_fork(p);
+
spin_lock(&current->sighand->siglock);
/*
@@ -2336,6 +2417,7 @@ static __latent_entropy struct task_struct *copy_process(
return p;
bad_fork_cancel_cgroup:
+ sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
@@ -2384,10 +2466,10 @@ bad_fork_cleanup_threadgroup_lock:
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
- p->state = TASK_DEAD;
+ WRITE_ONCE(p->__state, TASK_DEAD);
put_task_stack(p);
delayed_free_task(p);
fork_out:
@@ -2407,7 +2489,7 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
-struct task_struct *fork_idle(int cpu)
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
@@ -2997,6 +3079,12 @@ int ksys_unshare(unsigned long unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
+ if (new_cred) {
+ err = set_cred_ucounts(new_cred);
+ if (err)
+ goto bad_unshare_cleanup_cred;
+ }
+
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
diff --git a/kernel/freezer.c b/kernel/freezer.c
index dc520f01f99d..45ab36ffd0e7 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -58,7 +58,7 @@ bool __refrigerator(bool check_kthr_stop)
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
bool was_frozen = false;
- long save = current->state;
+ unsigned int save = get_current_state();
pr_debug("%s entered refrigerator\n", current->comm);
diff --git a/kernel/futex.c b/kernel/futex.c
index 408cad5e8968..c15ad276fd15 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,7 @@ struct futex_pi_state {
/*
* The PI object:
*/
- struct rt_mutex pi_mutex;
+ struct rt_mutex_base pi_mutex;
struct task_struct *owner;
refcount_t refcount;
@@ -197,6 +197,8 @@ struct futex_pi_state {
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
* we can wake only the relevant ones (hashed queues may be shared).
@@ -219,12 +221,68 @@ struct futex_q {
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
+ atomic_t requeue_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
} __randomize_layout;
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on hash bucket lock, but aside of having the benefit of common code,
+ * this allows to avoid doing the requeue when the task is already on the
+ * way out and taking the hash bucket lock of the original uaddr1 when the
+ * requeue has been completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress it has
+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
static const struct futex_q futex_q_init = {
/* list gets initialized in queue_me()*/
- .key = FUTEX_KEY_INIT,
- .bitset = FUTEX_BITSET_MATCH_ANY
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY,
+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
};
/*
@@ -1205,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
return -ESRCH;
}
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
@@ -1214,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
struct task_struct *p;
/*
@@ -1266,60 +1353,14 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
return ret;
}
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
+ __attach_to_pi_owner(p, key, ps);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
- *ps = pi_state;
-
return 0;
}
-static int lookup_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- struct futex_q *top_waiter = futex_top_waiter(hb, key);
-
- /*
- * If there is a waiter on that futex, validate it and
- * attach to the pi_state when the validation succeeds.
- */
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * We are the first waiter - try to look up the owner based on
- * @uval and attach to it.
- */
- return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
-}
-
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
int err;
@@ -1354,7 +1395,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* - 1 - acquired the lock;
* - <0 - error
*
- * The hb->lock and futex_key refs shall be held by the caller.
+ * The hb->lock must be held by the caller.
*
* @exiting is only set when the return value is -EBUSY. If so, this holds
* a refcount on the exiting task on return and the caller needs to drop it
@@ -1417,8 +1458,26 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
newval |= FUTEX_WAITERS;
ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
}
/*
@@ -1493,11 +1552,11 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
*/
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- u32 curval, newval;
struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
- DEFINE_WAKE_Q(wake_q);
+ DEFINE_RT_WAKE_Q(wqh);
+ u32 curval, newval;
int ret = 0;
top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
@@ -1549,14 +1608,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
* not fail.
*/
pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
}
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ rt_mutex_postunlock(&wqh);
return ret;
}
@@ -1727,12 +1786,9 @@ retry_private:
return ret;
}
- if (!(flags & FLAGS_SHARED)) {
- cond_resched();
- goto retry_private;
- }
-
cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
goto retry;
}
@@ -1796,6 +1852,108 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
q->key = *key2;
}
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int old, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and a interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (old != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return;
+
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
+ old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ /* Is requeue done already? */
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
/**
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* @q: the futex_q
@@ -1803,12 +1961,26 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
* @hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
*/
static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
@@ -1823,6 +1995,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
q->lock_ptr = &hb->lock;
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1860,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret, vpid;
+ int ret;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1873,7 +2047,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* If the caller intends to requeue more than 1 waiter to pifutex,
* force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
* as we have means to handle the possible fault. If not, don't set
- * the bit unecessarily as it will force the subsequent unlock to enter
+ * the bit unnecessarily as it will force the subsequent unlock to enter
* the kernel.
*/
top_waiter = futex_top_waiter(hb1, key1);
@@ -1882,21 +2056,52 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
if (!top_waiter)
return 0;
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
/* Ensure we requeue to the expected futex. */
if (!match_futex(top_waiter->requeue_pi_key, key2))
return -EINVAL;
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
/*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
*/
- vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
exiting, set_waiters);
if (ret == 1) {
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
}
return ret;
}
@@ -1951,23 +2156,35 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
return -EINVAL;
/*
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI, waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex, because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to be requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one, and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
if (refill_pi_state_cache())
return -ENOMEM;
- /*
- * requeue_pi must wake as many tasks as it can, up to nr_wake
- * + nr_requeue, since it acquires the rt_mutex prior to
- * returning to userspace, so as to not leave the rt_mutex with
- * waiters and no owner. However, second and third wake-ups
- * cannot be predicted as they involve race conditions with the
- * first wake and a fault while looking up the pi_state. Both
- * pthread_cond_signal() and pthread_cond_broadcast() should
- * use nr_wake=1.
- */
- if (nr_wake != 1)
- return -EINVAL;
}
retry:
@@ -2017,7 +2234,7 @@ retry_private:
}
}
- if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ if (requeue_pi) {
struct task_struct *exiting = NULL;
/*
@@ -2025,45 +2242,65 @@ retry_private:
* intend to requeue waiters, force setting the FUTEX_WAITERS
* bit. We force this here where we are able to easily handle
* faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
&key2, &pi_state,
&exiting, nr_requeue);
/*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, ret contains the
- * vpid of the top waiter task.
- * If the lock was not taken, we have pi_state and an initial
- * refcount on it. In case of an error we have nothing.
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
*/
- if (ret > 0) {
- WARN_ON(pi_state);
- task_count++;
- /*
- * If we acquired the lock, then the user space value
- * of uaddr2 should be vpid. It cannot be changed by
- * the top waiter as it is blocked on hb2 lock if it
- * tries to do so. If something fiddled with it behind
- * our back the pi state lookup might unearth it. So
- * we rather use the known value than rereading and
- * handing potential crap to lookup_pi_state.
- *
- * If that call succeeds then we have pi_state and an
- * initial refcount on it.
- */
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
- &pi_state, &exiting);
- }
-
switch (ret) {
case 0:
/* We hold a reference on the pi state. */
break;
- /* If the above failed, then pi_state is NULL */
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
case -EFAULT:
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -2102,7 +2339,7 @@ retry_private:
continue;
/*
- * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
*
* We should never be requeueing a futex_q with a pi_state,
@@ -2115,18 +2352,17 @@ retry_private:
break;
}
- /*
- * Wake nr_wake waiters. For requeue_pi, if we acquired the
- * lock, we already woke the top_waiter. If not, it will be
- * woken by futex_unlock_pi().
- */
- if (++task_count <= nr_wake && !requeue_pi) {
- mark_wake_futex(&wake_q, this);
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ mark_wake_futex(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
continue;
}
/* Ensure we requeue to the expected futex for requeue_pi. */
- if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ if (!match_futex(this->requeue_pi_key, &key2)) {
ret = -EINVAL;
break;
}
@@ -2134,54 +2370,66 @@ retry_private:
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
*/
- if (requeue_pi) {
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
- * Prepare the waiter to take the rt_mutex. Take a
- * refcount on the pi_state and store the pointer in
- * the futex_q object of the waiter.
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
*/
- get_pi_state(pi_state);
- this->pi_state = pi_state;
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the
- * refcount on pi_state nor clear
- * this->pi_state because the waiter needs the
- * pi_state for cleaning up the user space
- * value. It will drop the refcount after
- * doing so.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- continue;
- } else if (ret) {
- /*
- * rt_mutex_start_proxy_lock() detected a
- * potential deadlock when we tried to queue
- * that waiter. Drop the pi_state reference
- * which we took above and remove the pointer
- * to the state from the waiters futex_q
- * object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- /*
- * We stop queueing more waiters and let user
- * space deal with the mess.
- */
- break;
- }
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
}
- requeue_futex(this, hb1, hb2, &key2);
}
/*
- * We took an extra initial reference to the pi_state either
- * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
- * need to drop it here again.
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
*/
put_pi_state(pi_state);
@@ -2317,7 +2565,7 @@ retry:
}
/*
- * PI futexes can not be requeued and must remove themself from the
+ * PI futexes can not be requeued and must remove themselves from the
* hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
static void unqueue_me_pi(struct futex_q *q)
@@ -2360,7 +2608,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* Modifying pi_state _before_ the user space value would leave the
* pi_state in an inconsistent state when we fault here, because we
* need to drop the locks to handle the fault. This might be observed
- * in the PID check in lookup_pi_state.
+ * in the PID checks when attaching to PI state .
*/
retry:
if (!argowner) {
@@ -2617,8 +2865,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
*
* Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held and a q.key reference on success, and unlocked
- * with no q.key reference on failure.
+ * Return with the hb lock held on success, and unlocked on failure.
*
* Return:
* - 0 - uaddr contains val and hb has been locked;
@@ -2696,8 +2943,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
current->timer_slack_ns);
retry:
/*
- * Prepare to wait on uaddr. On success, holds hb lock and increments
- * q.key refs.
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
*/
ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
@@ -2708,7 +2955,6 @@ retry:
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
- /* unqueue_me() drops q.key ref */
if (!unqueue_me(&q))
goto out;
ret = -ETIMEDOUT;
@@ -2785,7 +3031,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
if (refill_pi_state_cache())
return -ENOMEM;
- to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
+ to = futex_setup_timer(time, &timeout, flags, 0);
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -2902,7 +3148,7 @@ no_block:
*/
res = fixup_owner(uaddr, &q, !ret);
/*
- * If fixup_owner() returned an error, proprogate that. If it acquired
+ * If fixup_owner() returned an error, propagate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
*/
if (res)
@@ -3075,27 +3321,22 @@ pi_faulted:
}
/**
- * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
* @hb: the hash_bucket futex_q was original enqueued on
* @q: the futex_q woken while waiting to be requeued
- * @key2: the futex_key of the requeue target futex
* @timeout: the timeout associated with the wait (NULL if none)
*
- * Detect if the task was woken on the initial futex as opposed to the requeue
- * target futex. If so, determine if it was a timeout or a signal that caused
- * the wakeup and return the appropriate error code to the caller. Must be
- * called with the hb lock held.
+ * Determine the cause for the early wakeup.
*
* Return:
- * - 0 = no early wakeup detected;
- * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q, union futex_key *key2,
+ struct futex_q *q,
struct hrtimer_sleeper *timeout)
{
- int ret = 0;
+ int ret;
/*
* With the hb lock held, we avoid races while we process the wakeup.
@@ -3104,22 +3345,21 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* It can't be requeued from uaddr2 to something else since we don't
* support a PI aware source futex for requeue.
*/
- if (!match_futex(&q->key, key2)) {
- WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- }
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
return ret;
}
@@ -3172,6 +3412,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
+ struct rt_mutex_base *pi_mutex;
int res, ret;
if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -3201,8 +3442,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
q.requeue_pi_key = &key2;
/*
- * Prepare to wait on uaddr. On success, increments q.key (key1) ref
- * count.
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
*/
ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
@@ -3221,32 +3462,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out;
-
- /*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
- */
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ break;
- /*
- * Check if the requeue code acquired the second futex for us and do
- * any pertinent fixup.
- */
- if (!q.rt_waiter) {
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_owner(uaddr2, &q, true);
/*
- * Drop the reference to the pi state which
- * the requeue_pi() code acquired for us.
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
@@ -3256,18 +3487,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
ret = ret < 0 ? ret : 0;
}
- } else {
- struct rt_mutex *pi_mutex;
+ break;
- /*
- * We have been woken up by futex_unlock_pi(), a timeout, or a
- * signal. futex_unlock_pi() will not destroy the lock_ptr nor
- * the pi_state.
- */
- WARN_ON(!q.pi_state);
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+ /* Current is not longer pi_blocked_on */
spin_lock(q.lock_ptr);
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
@@ -3279,7 +3506,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
res = fixup_owner(uaddr2, &q, !ret);
/*
- * If fixup_owner() returned an error, proprogate that. If it
+ * If fixup_owner() returned an error, propagate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
*/
if (res)
@@ -3287,17 +3514,21 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
unqueue_me_pi(&q);
spin_unlock(q.lock_ptr);
- }
- if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart by calling
- * futex_lock_pi() directly. We could restart this syscall, but
- * it would detect that the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart and return
- * -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
}
out:
@@ -3677,7 +3908,7 @@ void futex_exec_release(struct task_struct *tsk)
{
/*
* The state handling is done for consistency, but in the case of
- * exec() there is no way to prevent futher damage as the PID stays
+ * exec() there is no way to prevent further damage as the PID stays
* the same. But for the unlikely and arguably buggy case that a
* futex is held on exec(), this provides at least as much state
* consistency protection which is possible.
@@ -3709,12 +3940,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ cmd != FUTEX_LOCK_PI2)
return -ENOSYS;
}
switch (cmd) {
case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
case FUTEX_UNLOCK_PI:
case FUTEX_TRYLOCK_PI:
case FUTEX_WAIT_REQUEUE_PI:
@@ -3741,6 +3974,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
+ flags |= FLAGS_CLOCKRT;
+ fallthrough;
+ case FUTEX_LOCK_PI2:
return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
return futex_unlock_pi(uaddr, flags);
@@ -3761,6 +3997,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd)
switch (cmd) {
case FUTEX_WAIT:
case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
case FUTEX_WAIT_BITSET:
case FUTEX_WAIT_REQUEUE_PI:
return true;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 58f87a3092f3..053447183ac5 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -5,6 +5,7 @@ config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
+ depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
select CONSTRUCTORS
default n
help
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 34a1dc2abc7d..1966a749e0d9 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -56,9 +56,7 @@ if [ -f kernel/kheaders.md5 ] &&
exit
fi
-if [ "${quiet}" != "silent_" ]; then
- echo " GEN $tarfile"
-fi
+echo " GEN $tarfile"
rm -rf $cpio_dir
mkdir $cpio_dir
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 396ebaebea3f..9888e2bc8c76 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
+#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
@@ -196,7 +197,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
+ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
}
unlock:
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d79ef2493a28..fbc54c2a7f23 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -70,6 +70,11 @@ config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Support for obsolete non-mapping irq domains
+config IRQ_DOMAIN_NOMAP
+ bool
+ select IRQ_DOMAIN
+
# Support for hierarchical fasteoi+edge and fasteoi+level handlers
config IRQ_FASTEOI_HIERARCHY_HANDLERS
bool
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..f7ff8919dc9b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -355,7 +355,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
goto fail_npresmsk;
/* Stabilize the cpumasks */
- get_online_cpus();
+ cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
@@ -384,7 +384,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
nr_others = ret;
fail_build_affinity:
- put_online_cpus();
+ cpus_read_unlock();
if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);
@@ -505,9 +505,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
if (affd->calc_sets) {
set_vecs = maxvec - resv;
} else {
- get_online_cpus();
+ cpus_read_lock();
set_vecs = cpumask_weight(cpu_possible_mask);
- put_online_cpus();
+ cpus_read_unlock();
}
return resv + min(set_vecs, maxvec - resv);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8cc8e5713287..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
@@ -481,7 +484,7 @@ void handle_nested_irq(unsigned int irq)
for_each_action_of_desc(desc, action)
action_ret |= action->thread_fn(action->irq, action->dev_id);
- if (!noirqdebug)
+ if (!irq_settings_no_debug(desc))
note_interrupt(desc, action_ret);
raw_spin_lock_irq(&desc->lock);
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 02236b13b359..39a41c56ad4f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -166,7 +166,7 @@ void irq_migrate_all_off_this_cpu(void)
raw_spin_unlock(&desc->lock);
if (affinity_broken) {
- pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
+ pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
irq, smp_processor_id());
}
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f8f23af6ab0d..cc7cdd26e23e 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -240,9 +240,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = kzalloc(sz, GFP_KERNEL);
+ gc = kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc) {
irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
handler);
@@ -288,8 +287,11 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- int numchips, sz, i;
unsigned long flags;
+ int numchips, i;
+ size_t dgc_sz;
+ size_t gc_sz;
+ size_t sz;
void *tmp;
if (d->gc)
@@ -300,8 +302,9 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- sz = sizeof(*dgc) + numchips * sizeof(gc);
- sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+ gc_sz = struct_size(gc, chip_types, num_ct);
+ dgc_sz = struct_size(dgc, gc, numchips);
+ sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
@@ -314,7 +317,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
d->gc = dgc;
/* Calc pointer to the first generic chip */
- tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ tmp += dgc_sz;
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
@@ -331,7 +334,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
/* Calc pointer to the next generic chip */
- tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ tmp += gc_sz;
}
return 0;
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 762a928e18f9..221d80c31e94 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -197,7 +197,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
add_interrupt_randomness(desc->irq_data.irq, flags);
- if (!noirqdebug)
+ if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
return retval;
}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 52f11c791bf8..08ce7da3b57c 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -14,11 +14,11 @@
/**
* irq_reserve_ipi() - Setup an IPI to destination cpumask
* @domain: IPI domain
- * @dest: cpumask of cpus which can receive the IPI
+ * @dest: cpumask of CPUs which can receive the IPI
*
* Allocate a virq that can be used to send IPI to any CPU in dest mask.
*
- * On success it'll return linux irq number and error code on failure
+ * Return: Linux IRQ number on success or error code on failure
*/
int irq_reserve_ipi(struct irq_domain *domain,
const struct cpumask *dest)
@@ -104,13 +104,13 @@ free_descs:
/**
* irq_destroy_ipi() - unreserve an IPI that was previously allocated
- * @irq: linux irq number to be destroyed
- * @dest: cpumask of cpus which should have the IPI removed
+ * @irq: Linux IRQ number to be destroyed
+ * @dest: cpumask of CPUs which should have the IPI removed
*
* The IPIs allocated with irq_reserve_ipi() are returned to the system
* destroying all virqs associated with them.
*
- * Return 0 on success or error code on failure.
+ * Return: %0 on success or error code on failure.
*/
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
@@ -150,14 +150,14 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
}
/**
- * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
- * @irq: linux irq number
- * @cpu: the target cpu
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU
+ * @irq: Linux IRQ number
+ * @cpu: the target CPU
*
* When dealing with coprocessors IPI, we need to inform the coprocessor of
* the hwirq it needs to use to receive and send IPIs.
*
- * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ * Return: hwirq value on success or INVALID_HWIRQ on failure.
*/
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
@@ -216,7 +216,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
{
@@ -250,7 +250,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
}
/**
- * ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
* @desc: pointer to irq_desc of the IRQ
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
@@ -258,7 +258,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
{
@@ -298,11 +298,11 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
/**
* ipi_send_single - Send an IPI to a single CPU
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @cpu: destination CPU, must in the destination mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_single(unsigned int virq, unsigned int cpu)
{
@@ -319,11 +319,11 @@ EXPORT_SYMBOL_GPL(ipi_send_single);
/**
* ipi_send_mask - Send an IPI to target CPU(s)
- * @virq: linux irq number from irq_reserve_ipi()
+ * @virq: Linux IRQ number from irq_reserve_ipi()
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
*
- * Returns zero on success and negative error number on failure.
+ * Return: %0 on success or negative error number on failure.
*/
int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
{
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4a617d7312a4..4e3c29bb603c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
raw_spin_lock_irq(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+ ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
raw_spin_unlock_irq(&desc->lock);
return ret;
@@ -632,14 +632,8 @@ void irq_init_desc(unsigned int irq)
#endif /* !CONFIG_SPARSE_IRQ */
-/**
- * generic_handle_irq - Invoke the handler for a particular irq
- * @irq: The irq number to handle
- *
- */
-int generic_handle_irq(unsigned int irq)
+int handle_irq_desc(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
struct irq_data *data;
if (!desc)
@@ -652,49 +646,67 @@ int generic_handle_irq(unsigned int irq)
generic_handle_irq_desc(desc);
return 0;
}
+EXPORT_SYMBOL_GPL(handle_irq_desc);
+
+/**
+ * generic_handle_irq - Invoke the handler for a particular irq
+ * @irq: The irq number to handle
+ *
+ */
+int generic_handle_irq(unsigned int irq)
+{
+ return handle_irq_desc(irq_to_desc(irq));
+}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+#ifdef CONFIG_IRQ_DOMAIN
+/**
+ * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
+ * to a domain, usually for a non-root interrupt
+ * controller
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ */
+int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+{
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
+
#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
- * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
+ * usually for a root interrupt controller
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
- * @lookup: Whether to perform the domain lookup or not
* @regs: Register file coming from the low-level handling code
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
-int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
- bool lookup, struct pt_regs *regs)
+int handle_domain_irq(struct irq_domain *domain,
+ unsigned int hwirq, struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- unsigned int irq = hwirq;
+ struct irq_desc *desc;
int ret = 0;
irq_enter();
-#ifdef CONFIG_IRQ_DOMAIN
- if (lookup)
- irq = irq_find_mapping(domain, hwirq);
-#endif
-
- /*
- * Some hardware gives randomly wrong interrupts. Rather
- * than crashing, do something sensible.
- */
- if (unlikely(!irq || irq >= nr_irqs)) {
- ack_bad_irq(irq);
+ /* The irqdomain code provides boundary checks */
+ desc = irq_resolve_mapping(domain, hwirq);
+ if (likely(desc))
+ handle_irq_desc(desc);
+ else
ret = -EINVAL;
- } else {
- generic_handle_irq(irq);
- }
irq_exit();
set_irq_regs(old_regs);
return ret;
}
-#ifdef CONFIG_IRQ_DOMAIN
/**
* handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
* @domain: The domain where to perform the lookup
@@ -709,7 +721,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- unsigned int irq;
+ struct irq_desc *desc;
int ret = 0;
/*
@@ -717,14 +729,14 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
*/
WARN_ON(!in_nmi());
- irq = irq_find_mapping(domain, hwirq);
+ desc = irq_resolve_mapping(domain, hwirq);
/*
* ack_bad_irq is not NMI-safe, just report
* an invalid interrupt.
*/
- if (likely(irq))
- generic_handle_irq(irq);
+ if (likely(desc))
+ handle_irq_desc(desc);
else
ret = -EINVAL;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6284443b87ec..4d8fc65cf38f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
irq_hw_number_t hwirq_max, int direct_max,
const struct irq_domain_ops *ops,
void *host_data)
@@ -146,7 +146,11 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
static atomic_t unknown_domains;
- domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
+ if (WARN_ON((size && direct_max) ||
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max)))
+ return NULL;
+
+ domain = kzalloc_node(struct_size(domain, revmap, size),
GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
if (!domain)
return NULL;
@@ -209,12 +213,18 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_tree_mutex);
+ mutex_init(&domain->revmap_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
+
+ if (direct_max) {
+ size = direct_max;
+ domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
+ }
+
domain->revmap_size = size;
- domain->revmap_direct_max_irq = direct_max;
+
irq_domain_check_hierarchy(domain);
mutex_lock(&irq_domain_mutex);
@@ -481,30 +491,41 @@ struct irq_domain *irq_get_default_host(void)
{
return irq_default_domain;
}
+EXPORT_SYMBOL_GPL(irq_get_default_host);
+
+static bool irq_domain_is_nomap(struct irq_domain *domain)
+{
+ return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) &&
+ (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP);
+}
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&domain->revmap_tree_mutex);
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ mutex_lock(&domain->revmap_mutex);
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], NULL);
+ else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_tree_mutex);
- }
+ mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = irq_data->irq;
- } else {
- mutex_lock(&domain->revmap_tree_mutex);
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ mutex_lock(&domain->revmap_mutex);
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], irq_data);
+ else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_tree_mutex);
- }
+ mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -604,6 +625,7 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
+#ifdef CONFIG_IRQ_DOMAIN_NOMAP
/**
* irq_create_direct_mapping() - Allocate an irq for direct mapping
* @domain: domain to allocate the irq for or NULL for default domain
@@ -628,9 +650,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_direct_max_irq) {
+ if (virq >= domain->revmap_size) {
pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_direct_max_irq);
+ domain->revmap_size);
irq_free_desc(virq);
return 0;
}
@@ -644,6 +666,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+#endif
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
@@ -862,37 +885,53 @@ void irq_dispose_mapping(unsigned int virq)
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
/**
- * irq_find_mapping() - Find a linux irq from a hw irq number.
+ * __irq_resolve_mapping() - Find a linux irq from a hw irq number.
* @domain: domain owning this hardware interrupt
* @hwirq: hardware irq number in that domain space
+ * @irq: optional pointer to return the Linux irq if required
+ *
+ * Returns the interrupt descriptor.
*/
-unsigned int irq_find_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ unsigned int *irq)
{
+ struct irq_desc *desc = NULL;
struct irq_data *data;
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
- return 0;
+ return desc;
+
+ if (irq_domain_is_nomap(domain)) {
+ if (hwirq < domain->revmap_size) {
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
+ desc = irq_data_to_desc(data);
+ }
- if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_domain_get_irq_data(domain, hwirq);
- if (data && data->hwirq == hwirq)
- return hwirq;
+ return desc;
}
+ rcu_read_lock();
/* Check if the hwirq is in the linear revmap. */
if (hwirq < domain->revmap_size)
- return domain->linear_revmap[hwirq];
+ data = rcu_dereference(domain->revmap[hwirq]);
+ else
+ data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+
+ if (likely(data)) {
+ desc = irq_data_to_desc(data);
+ if (irq)
+ *irq = data->irq;
+ }
- rcu_read_lock();
- data = radix_tree_lookup(&domain->revmap_tree, hwirq);
rcu_read_unlock();
- return data ? data->irq : 0;
+ return desc;
}
-EXPORT_SYMBOL_GPL(irq_find_mapping);
+EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
@@ -1177,6 +1216,7 @@ int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
irqd->chip = ERR_PTR(-ENOTCONN);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy);
static int irq_domain_trim_hierarchy(unsigned int virq)
{
@@ -1468,15 +1508,20 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
- if (d->hwirq < d->domain->revmap_size)
- return; /* Not using radix tree. */
+ if (irq_domain_is_nomap(d->domain))
+ return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_tree_mutex);
- slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
- if (slot)
- radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
- mutex_unlock(&d->domain->revmap_tree_mutex);
+ mutex_lock(&d->domain->revmap_mutex);
+ if (d->hwirq < d->domain->revmap_size) {
+ /* Not using radix tree */
+ rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
+ } else {
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
+ }
+ mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1828,8 +1873,7 @@ static void
irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
{
seq_printf(m, "%*sname: %s\n", ind, "", d->name);
- seq_printf(m, "%*ssize: %u\n", ind + 1, "",
- d->revmap_size + d->revmap_direct_max_irq);
+ seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
if (d->ops && d->ops->debug_show)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c14356543d9..27667e82ecc9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -25,12 +25,11 @@
#include "internals.h"
#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
-__read_mostly bool force_irqthreads;
-EXPORT_SYMBOL_GPL(force_irqthreads);
+DEFINE_STATIC_KEY_FALSE(force_irqthreads_key);
static int __init setup_forced_irqthreads(char *arg)
{
- force_irqthreads = true;
+ static_branch_enable(&force_irqthreads_key);
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
@@ -441,7 +440,8 @@ out_unlock:
return ret;
}
-int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
+static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
+ bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -456,6 +456,36 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
return ret;
}
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Fails if cpumask does not contain an online CPU
+ */
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ return __irq_set_affinity(irq, cpumask, false);
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity);
+
+/**
+ * irq_force_affinity - Force the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Same as irq_set_affinity, but without checking the mask against
+ * online cpus.
+ *
+ * Solely for low level cpu hotplug code, where we need to make per
+ * cpu interrupts affine before the cpu becomes online.
+ */
+int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ return __irq_set_affinity(irq, cpumask, true);
+}
+EXPORT_SYMBOL_GPL(irq_force_affinity);
+
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
unsigned long flags;
@@ -1229,8 +1259,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
- &action->thread_flags))
+ if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
handler_fn = irq_forced_thread_fn;
else
handler_fn = irq_thread_fn;
@@ -1291,7 +1321,7 @@ EXPORT_SYMBOL_GPL(irq_wake_thread);
static int irq_setup_forced_threading(struct irqaction *new)
{
- if (!force_irqthreads)
+ if (!force_irqthreads())
return 0;
if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
return 0;
@@ -1686,8 +1716,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (new->flags & IRQF_PERCPU) {
irqd_set(&desc->irq_data, IRQD_PER_CPU);
irq_settings_set_per_cpu(desc);
+ if (new->flags & IRQF_NO_DEBUG)
+ irq_settings_set_no_debug(desc);
}
+ if (noirqdebug)
+ irq_settings_set_no_debug(desc);
+
if (new->flags & IRQF_ONESHOT)
desc->istate |= IRQS_ONESHOT;
@@ -2036,9 +2071,9 @@ const void *free_nmi(unsigned int irq, void *dev_id)
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts
- * If NULL and thread_fn != NULL the default
- * primary handler is installed
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
@@ -2072,7 +2107,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
- *
+ * IRQF_ONESHOT Run thread_fn with interrupt line masked
*/
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 578596e41cb6..bbfb26489aa1 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -280,7 +280,8 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
/**
* irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
* @m: Matrix pointer
- * @cpu: On which CPU the interrupt should be allocated
+ * @msk: Which CPUs to search in
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
*/
int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
unsigned int *mapped_cpu)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..6a5ecee6e567 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,17 +14,20 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/pci.h>
#include "internals.h"
/**
- * alloc_msi_entry - Allocate an initialize msi_entry
+ * alloc_msi_entry - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
*
- * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * If @affinity is not %NULL then an affinity array[@nvec] is allocated
* and the affinity masks and flags from @affinity are copied.
+ *
+ * Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
const struct irq_affinity_desc *affinity)
@@ -69,6 +72,139 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct msi_desc *entry;
+ bool is_msix = false;
+ unsigned long irq;
+ int retval;
+
+ retval = kstrtoul(attr->attr.name, 10, &irq);
+ if (retval)
+ return retval;
+
+ entry = irq_get_msi_desc(irq);
+ if (!entry)
+ return -ENODEV;
+
+ if (dev_is_pci(dev))
+ is_msix = entry->msi_attrib.is_msix;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+/**
+ * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
+ * @dev: The device(PCI, platform etc) who will get sysfs entries
+ *
+ * Return attribute_group ** so that specific bus MSI can save it to
+ * somewhere during initilizing msi irqs. If devices has no MSI irq,
+ * return NULL; if it fails to populate sysfs, return ERR_PTR
+ */
+const struct attribute_group **msi_populate_sysfs(struct device *dev)
+{
+ const struct attribute_group **msi_irq_groups;
+ struct attribute **msi_attrs, *msi_attr;
+ struct device_attribute *msi_dev_attr;
+ struct attribute_group *msi_irq_group;
+ struct msi_desc *entry;
+ int ret = -ENOMEM;
+ int num_msi = 0;
+ int count = 0;
+ int i;
+
+ /* Determine how many msi entries we have */
+ for_each_msi_entry(entry, dev)
+ num_msi += entry->nvec_used;
+ if (!num_msi)
+ return NULL;
+
+ /* Dynamically create the MSI attributes for the device */
+ msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
+ if (!msi_attrs)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_msi_entry(entry, dev) {
+ for (i = 0; i < entry->nvec_used; i++) {
+ msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
+ if (!msi_dev_attr)
+ goto error_attrs;
+ msi_attrs[count] = &msi_dev_attr->attr;
+
+ sysfs_attr_init(&msi_dev_attr->attr);
+ msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
+ entry->irq + i);
+ if (!msi_dev_attr->attr.name)
+ goto error_attrs;
+ msi_dev_attr->attr.mode = 0444;
+ msi_dev_attr->show = msi_mode_show;
+ ++count;
+ }
+ }
+
+ msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
+ if (!msi_irq_group)
+ goto error_attrs;
+ msi_irq_group->name = "msi_irqs";
+ msi_irq_group->attrs = msi_attrs;
+
+ msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
+ if (!msi_irq_groups)
+ goto error_irq_group;
+ msi_irq_groups[0] = msi_irq_group;
+
+ ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
+ if (ret)
+ goto error_irq_groups;
+
+ return msi_irq_groups;
+
+error_irq_groups:
+ kfree(msi_irq_groups);
+error_irq_group:
+ kfree(msi_irq_group);
+error_attrs:
+ count = 0;
+ msi_attr = msi_attrs[count];
+ while (msi_attr) {
+ msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
+ kfree(msi_attr->name);
+ kfree(msi_dev_attr);
+ ++count;
+ msi_attr = msi_attrs[count];
+ }
+ kfree(msi_attrs);
+ return ERR_PTR(ret);
+}
+
+/**
+ * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
+ * @dev: The device(PCI, platform etc) who will remove sysfs entries
+ * @msi_irq_groups: attribute_group for device msi_irqs entries
+ */
+void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
+{
+ struct device_attribute *dev_attr;
+ struct attribute **msi_attrs;
+ int count = 0;
+
+ if (msi_irq_groups) {
+ sysfs_remove_groups(&dev->kobj, msi_irq_groups);
+ msi_attrs = msi_irq_groups[0]->attrs;
+ while (msi_attrs[count]) {
+ dev_attr = container_of(msi_attrs[count],
+ struct device_attribute, attr);
+ kfree(dev_attr->attr.name);
+ kfree(dev_attr);
+ ++count;
+ }
+ kfree(msi_attrs);
+ kfree(msi_irq_groups[0]);
+ kfree(msi_irq_groups);
+ }
+}
+
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -97,6 +233,8 @@ static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
*
* Intended to be used by MSI interrupt controllers which are
* implemented with hierarchical domains.
+ *
+ * Return: IRQ_SET_MASK_* result code
*/
int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
@@ -277,10 +415,12 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
}
/**
- * msi_create_irq_domain - Create a MSI interrupt domain
+ * msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
* @info: MSI domain info
* @parent: Parent irq domain
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
*/
struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
@@ -476,11 +616,6 @@ skip_activate:
return 0;
cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
msi_domain_free_irqs(domain, dev);
return ret;
}
@@ -492,7 +627,7 @@ cleanup:
* are allocated
* @nvec: The number of interrupts to allocate
*
- * Returns 0 on success or an error code.
+ * Return: %0 on success or an error code.
*/
int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
int nvec)
@@ -505,7 +640,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ int i;
+
+ for_each_msi_vector(desc, i, dev) {
+ irq_data = irq_domain_get_irq_data(domain, i);
+ if (irqd_is_activated(irq_data))
+ irq_domain_deactivate_irq(irq_data);
+ }
for_each_msi_entry(desc, dev) {
/*
@@ -521,7 +664,7 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
/**
- * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
* @domain: The domain to managing the interrupts
* @dev: Pointer to device struct of the device for which the interrupts
* are free
@@ -538,8 +681,7 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
- * Returns the pointer to the msi_domain_info stored in
- * @domain->host_data.
+ * Return: the pointer to the msi_domain_info stored in @domain->host_data.
*/
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index ce0adb22ee96..ca71123a6130 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -227,7 +227,7 @@ unlock:
}
/**
- * irq_pm_syscore_ops - enable interrupt lines early
+ * irq_pm_syscore_resume - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7c5cd42df3b9..ee595ec09778 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
- seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
else
seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 403378b9947b..7b7efb1a114b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -18,6 +18,7 @@ enum {
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
_IRQ_HIDDEN = IRQ_HIDDEN,
+ _IRQ_NO_DEBUG = IRQ_NO_DEBUG,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -33,6 +34,7 @@ enum {
#define IRQ_IS_POLLED GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
#define IRQ_HIDDEN GOT_YOU_MORON
+#define IRQ_NO_DEBUG GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -174,3 +176,13 @@ static inline bool irq_settings_is_hidden(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_HIDDEN;
}
+
+static inline void irq_settings_set_no_debug(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_DEBUG;
+}
+
+static inline bool irq_settings_no_debug(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_DEBUG;
+}
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index d309d6fbf5bd..c43e2ac2f8de 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
@@ -794,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti)
__irq_timings_store(irq, irqs, ti->intervals[i]);
if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ ret = -EBADSLT;
pr_err("Failed to store in the circular buffer\n");
goto out;
}
}
if (irqs->count != ti->count) {
+ ret = -ERANGE;
pr_err("Count differs\n");
goto out;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index ba39fbb1f8e7..b156e152d6b4 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -309,21 +309,23 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
if (jump_entry_code(entry) <= (unsigned long)end &&
- jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+ jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start)
return 1;
return 0;
}
static int __jump_label_text_reserved(struct jump_entry *iter_start,
- struct jump_entry *iter_stop, void *start, void *end)
+ struct jump_entry *iter_stop, void *start, void *end, bool init)
{
struct jump_entry *iter;
iter = iter_start;
while (iter < iter_stop) {
- if (addr_conflict(iter, start, end))
- return 1;
+ if (init || !jump_entry_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
iter++;
}
@@ -483,13 +485,14 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
/* rewrite NOPs */
if (jump_label_type(iter) == JUMP_LABEL_NOP)
arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- if (init_section_contains((void *)jump_entry_code(iter), 1))
- jump_entry_set_init(iter);
+ in_init = init_section_contains((void *)jump_entry_code(iter), 1);
+ jump_entry_set_init(iter, in_init);
iterk = jump_entry_key(iter);
if (iterk == key)
@@ -561,7 +564,7 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
ret = __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
- start, end);
+ start, end, mod->state == MODULE_STATE_COMING);
module_put(mod);
@@ -634,9 +637,10 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
- if (within_module_init(jump_entry_code(iter), mod))
- jump_entry_set_init(iter);
+ in_init = within_module_init(jump_entry_code(iter), mod);
+ jump_entry_set_init(iter, in_init);
iterk = jump_entry_key(iter);
if (iterk == key)
@@ -786,8 +790,9 @@ early_initcall(jump_label_init_module);
*/
int jump_label_text_reserved(void *start, void *end)
{
+ bool init = system_state < SYSTEM_RUNNING;
int ret = __jump_label_text_reserved(__start___jump_table,
- __stop___jump_table, start, end);
+ __stop___jump_table, start, end, init);
if (ret)
return ret;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index c851ca0ed357..0ba87982d017 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -25,7 +25,10 @@
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
+#include <linux/build_bug.h>
#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
/*
* These will be re-linked against their real values
@@ -297,21 +300,14 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
get_symbol_pos(addr, symbolsize, offset);
return 1;
}
- return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
-/*
- * Lookup an address
- * - modname is set to NULL if it's in the kernel.
- * - We guarantee that the returned name is valid until we reschedule even if.
- * It resides in a module.
- * - We also guarantee that modname will be valid until rescheduled.
- */
-const char *kallsyms_lookup(unsigned long addr,
- unsigned long *symbolsize,
- unsigned long *offset,
- char **modname, char *namebuf)
+static const char *kallsyms_lookup_buildid(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset, char **modname,
+ const unsigned char **modbuildid, char *namebuf)
{
const char *ret;
@@ -327,6 +323,8 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
+ if (modbuildid)
+ *modbuildid = NULL;
ret = namebuf;
goto found;
@@ -334,7 +332,7 @@ const char *kallsyms_lookup(unsigned long addr,
/* See if it's in a module or a BPF JITed image. */
ret = module_address_lookup(addr, symbolsize, offset,
- modname, namebuf);
+ modname, modbuildid, namebuf);
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
@@ -348,6 +346,22 @@ found:
return ret;
}
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ return kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
+ NULL, namebuf);
+}
+
int lookup_symbol_name(unsigned long addr, char *symname)
{
int res;
@@ -404,15 +418,17 @@ found:
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
- int symbol_offset, int add_offset)
+ int symbol_offset, int add_offset, int add_buildid)
{
char *modname;
+ const unsigned char *buildid;
const char *name;
unsigned long offset, size;
int len;
address += symbol_offset;
- name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
+ name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
+ buffer);
if (!name)
return sprintf(buffer, "0x%lx", address - symbol_offset);
@@ -424,8 +440,19 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (add_offset)
len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
- if (modname)
- len += sprintf(buffer + len, " [%s]", modname);
+ if (modname) {
+ len += sprintf(buffer + len, " [%s", modname);
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ if (add_buildid && buildid) {
+ /* build ID should match length of sprintf */
+#if IS_ENABLED(CONFIG_MODULES)
+ static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+ len += sprintf(buffer + len, " %20phN", buildid);
+ }
+#endif
+ len += sprintf(buffer + len, "]");
+ }
return len;
}
@@ -443,11 +470,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
*/
int sprint_symbol(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 1);
+ return __sprint_symbol(buffer, address, 0, 1, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
+ * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size, module name and module build ID to @buffer if possible. If no
+ * symbol was found, just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_build_id(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 1, 1);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_build_id);
+
+/**
* sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
@@ -460,7 +504,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
*/
int sprint_symbol_no_offset(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 0);
+ return __sprint_symbol(buffer, address, 0, 0, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
@@ -480,7 +524,27 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, -1, 1);
+ return __sprint_symbol(buffer, address, -1, 1, 0);
+}
+
+/**
+ * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address. This function also appends the module build ID to
+ * the @buffer if @address is within a kernel module.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace_build_id(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, -1, 1, 1);
}
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
deleted file mode 100644
index 530ae1bda8e7..000000000000
--- a/kernel/kcsan/atomic.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Rules for implicitly atomic memory accesses.
- *
- * Copyright (C) 2019, Google LLC.
- */
-
-#ifndef _KERNEL_KCSAN_ATOMIC_H
-#define _KERNEL_KCSAN_ATOMIC_H
-
-#include <linux/types.h>
-
-/*
- * Special rules for certain memory where concurrent conflicting accesses are
- * common, however, the current convention is to not mark them; returns true if
- * access to @ptr should be considered atomic. Called from slow-path.
- */
-static bool kcsan_is_atomic_special(const volatile void *ptr)
-{
- return false;
-}
-
-#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 45c821d4e8bd..76e67d1e02d4 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -20,9 +20,9 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
-#include "atomic.h"
#include "encoding.h"
#include "kcsan.h"
+#include "permissive.h"
static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
@@ -301,9 +301,9 @@ static inline void reset_kcsan_skip(void)
this_cpu_write(kcsan_skip, skip_count);
}
-static __always_inline bool kcsan_is_enabled(void)
+static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx)
{
- return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
+ return READ_ONCE(kcsan_enabled) && !ctx->disable_count;
}
/* Introduce delay depending on context and configuration. */
@@ -353,10 +353,18 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ struct kcsan_ctx *ctx = get_ctx();
unsigned long flags;
bool consumed;
- if (!kcsan_is_enabled())
+ /*
+ * We know a watchpoint exists. Let's try to keep the race-window
+ * between here and finally consuming the watchpoint below as small as
+ * possible -- avoid unneccessarily complex code until consumed.
+ */
+
+ if (!kcsan_is_enabled(ctx))
return;
/*
@@ -364,14 +372,22 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
*/
- if (get_ctx()->access_mask != 0)
+ if (ctx->access_mask)
return;
/*
- * Consume the watchpoint as soon as possible, to minimize the chances
- * of !consumed. Consuming the watchpoint must always be guarded by
- * kcsan_is_enabled() check, as otherwise we might erroneously
- * triggering reports when disabled.
+ * If the other thread does not want to ignore the access, and there was
+ * a value change as a result of this thread's operation, we will still
+ * generate a report of unknown origin.
+ *
+ * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter.
+ */
+ if (!is_assert && kcsan_ignore_address(ptr))
+ return;
+
+ /*
+ * Consuming the watchpoint must be guarded by kcsan_is_enabled() to
+ * avoid erroneously triggering reports if the context is disabled.
*/
consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
@@ -380,9 +396,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
- watchpoint - watchpoints);
+ kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -393,7 +407,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
}
- if ((type & KCSAN_ACCESS_ASSERT) != 0)
+ if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
else
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
@@ -407,15 +421,11 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
- union {
- u8 _1;
- u16 _2;
- u32 _4;
- u64 _8;
- } expect_value;
+ u64 old, new, diff;
unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
unsigned long ua_flags = user_access_save();
+ struct kcsan_ctx *ctx = get_ctx();
unsigned long irq_flags = 0;
/*
@@ -424,16 +434,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
reset_kcsan_skip();
- if (!kcsan_is_enabled())
+ if (!kcsan_is_enabled(ctx))
goto out;
/*
- * Special atomic rules: unlikely to be true, so we check them here in
- * the slow-path, and not in the fast-path in is_atomic(). Call after
- * kcsan_is_enabled(), as we may access memory that is not yet
- * initialized during early boot.
+ * Check to-ignore addresses after kcsan_is_enabled(), as we may access
+ * memory that is not yet initialized during early boot.
*/
- if (!is_assert && kcsan_is_atomic_special(ptr))
+ if (!is_assert && kcsan_ignore_address(ptr))
goto out;
if (!check_encodable((unsigned long)ptr, size)) {
@@ -468,33 +476,24 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- expect_value._8 = 0;
+ old = 0;
switch (size) {
case 1:
- expect_value._1 = READ_ONCE(*(const u8 *)ptr);
+ old = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
- expect_value._2 = READ_ONCE(*(const u16 *)ptr);
+ old = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
- expect_value._4 = READ_ONCE(*(const u32 *)ptr);
+ old = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
- expect_value._8 = READ_ONCE(*(const u64 *)ptr);
+ old = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
}
- if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
- kcsan_disable_current();
- pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
- is_write ? "write" : "read", size, ptr,
- watchpoint_slot((unsigned long)ptr),
- encode_watchpoint((unsigned long)ptr, size, is_write));
- kcsan_enable_current();
- }
-
/*
* Delay this thread, to increase probability of observing a racy
* conflicting access.
@@ -505,34 +504,37 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = get_ctx()->access_mask;
+ access_mask = ctx->access_mask;
+ new = 0;
switch (size) {
case 1:
- expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
- if (access_mask)
- expect_value._1 &= (u8)access_mask;
+ new = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
- expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
- if (access_mask)
- expect_value._2 &= (u16)access_mask;
+ new = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
- expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
- if (access_mask)
- expect_value._4 &= (u32)access_mask;
+ new = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
- expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
- if (access_mask)
- expect_value._8 &= (u64)access_mask;
+ new = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
}
- /* Were we able to observe a value-change? */
- if (expect_value._8 != 0)
+ diff = old ^ new;
+ if (access_mask)
+ diff &= access_mask;
+
+ /*
+ * Check if we observed a value change.
+ *
+ * Also check if the data race should be ignored (the rules depend on
+ * non-zero diff); if it is to be ignored, the below rules for
+ * KCSAN_VALUE_CHANGE_MAYBE apply.
+ */
+ if (diff && !kcsan_ignore_data_race(size, type, old, new, diff))
value_change = KCSAN_VALUE_CHANGE_TRUE;
/* Check if this access raced with another. */
@@ -566,8 +568,9 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
- watchpoint - watchpoints);
+ kcsan_report_known_origin(ptr, size, type, value_change,
+ watchpoint - watchpoints,
+ old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@@ -576,9 +579,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
- watchpoint - watchpoints);
+ kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
}
/*
@@ -655,6 +656,15 @@ void __init kcsan_init(void)
pr_info("enabled early\n");
WRITE_ONCE(kcsan_enabled, true);
}
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) ||
+ IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) ||
+ IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) ||
+ IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n");
+ } else {
+ pr_info("strict mode configured\n");
+ }
}
/* === Exported interface =================================================== */
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index e65de172ccf7..1d1d1b0e4248 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -64,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters)
{
const struct kcsan_ctx ctx_save = current->kcsan_ctx;
const bool was_enabled = READ_ONCE(kcsan_enabled);
- cycles_t cycles;
+ u64 cycles;
/* We may have been called from an atomic region; reset context. */
memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 9881099d4179..f36e25c497ed 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -116,30 +116,27 @@ enum kcsan_value_change {
KCSAN_VALUE_CHANGE_TRUE,
};
-enum kcsan_report_type {
- /*
- * The thread that set up the watchpoint and briefly stalled was
- * signalled that another thread triggered the watchpoint.
- */
- KCSAN_REPORT_RACE_SIGNAL,
-
- /*
- * A thread found and consumed a matching watchpoint.
- */
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
+/*
+ * The calling thread hit and consumed a watchpoint: set the access information
+ * to be consumed by the reporting thread. No report is printed yet.
+ */
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ int watchpoint_idx);
- /*
- * No other thread was observed to race with the access, but the data
- * value before and after the stall differs.
- */
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
-};
+/*
+ * The calling thread observed that the watchpoint it set up was hit and
+ * consumed: print the full report based on information set by the racing
+ * thread.
+ */
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change, int watchpoint_idx,
+ u64 old, u64 new, u64 mask);
/*
- * Print a race report from thread that encountered the race.
+ * No other thread was observed to race with the access, but the data value
+ * before and after the stall differs. Reports a race of "unknown origin".
*/
-extern void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx);
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 8bcffbdef3d3..dc55fd5a36fc 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -414,6 +414,14 @@ static noinline void test_kernel_atomic_builtins(void)
__atomic_load_n(&test_var, __ATOMIC_RELAXED);
}
+static noinline void test_kernel_xor_1bit(void)
+{
+ /* Do not report data races between the read-writes. */
+ kcsan_nestable_atomic_begin();
+ test_var ^= 0x10000;
+ kcsan_nestable_atomic_end();
+}
+
/* ===== Test cases ===== */
/* Simple test with normal data race. */
@@ -952,6 +960,29 @@ static void test_atomic_builtins(struct kunit *test)
KUNIT_EXPECT_FALSE(test, match_never);
}
+__no_kcsan
+static void test_1bit_value_change(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ },
+ };
+ bool match = false;
+
+ begin_test_checks(test_kernel_read, test_kernel_xor_1bit);
+ do {
+ match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)
+ ? report_available()
+ : report_matches(&expect);
+ } while (!end_test_checks(match));
+ if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ KUNIT_EXPECT_FALSE(test, match);
+ else
+ KUNIT_EXPECT_TRUE(test, match);
+}
+
/*
* Generate thread counts for all test cases. Values generated are in interval
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
@@ -1024,6 +1055,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_jiffies_noreport),
KCSAN_KUNIT_CASE(test_seqlock_noreport),
KCSAN_KUNIT_CASE(test_atomic_builtins),
+ KCSAN_KUNIT_CASE(test_1bit_value_change),
{},
};
diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h
new file mode 100644
index 000000000000..2c01fe4a59ee
--- /dev/null
+++ b/kernel/kcsan/permissive.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Special rules for ignoring entire classes of data-racy memory accesses. None
+ * of the rules here imply that such data races are generally safe!
+ *
+ * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep
+ * them separate from core code to make it easier to audit.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_PERMISSIVE_H
+#define _KERNEL_KCSAN_PERMISSIVE_H
+
+#include <linux/bitops.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+/*
+ * Access ignore rules based on address.
+ */
+static __always_inline bool kcsan_ignore_address(const volatile void *ptr)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Data-racy bitops on current->flags are too common, ignore completely
+ * for now.
+ */
+ return ptr == &current->flags;
+}
+
+/*
+ * Data race ignore rules based on access type and value change patterns.
+ */
+static bool
+kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Rules here are only for plain read accesses, so that we still report
+ * data races between plain read-write accesses.
+ */
+ if (type || size > sizeof(long))
+ return false;
+
+ /*
+ * A common pattern is checking/setting just 1 bit in a variable; for
+ * example:
+ *
+ * if (flags & SOME_FLAG) { ... }
+ *
+ * and elsewhere flags is updated concurrently:
+ *
+ * flags |= SOME_OTHER_FLAG; // just 1 bit
+ *
+ * While it is still recommended that such accesses be marked
+ * appropriately, in many cases these types of data races are so common
+ * that marking them all is often unrealistic and left to maintainer
+ * preference.
+ *
+ * The assumption in all cases is that with all known compiler
+ * optimizations (including those that tear accesses), because no more
+ * than 1 bit changed, the plain accesses are safe despite the presence
+ * of data races.
+ *
+ * The rules here will ignore the data races if we observe no more than
+ * 1 bit changed.
+ *
+ * Of course many operations can effecively change just 1 bit, but the
+ * general assuption that data races involving 1-bit changes can be
+ * tolerated still applies.
+ *
+ * And in case a true bug is missed, the bug likely manifests as a
+ * reportable data race elsewhere.
+ */
+ if (hweight64(diff) == 1) {
+ /*
+ * Exception: Report data races where the values look like
+ * ordinary booleans (one of them was 0 and the 0th bit was
+ * changed) More often than not, they come with interesting
+ * memory ordering requirements, so let's report them.
+ */
+ if (!((!old || !new) && diff == 1))
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _KERNEL_KCSAN_PERMISSIVE_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 13dce3c664d6..21137929d428 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -325,13 +325,10 @@ static void print_verbose_info(struct task_struct *task)
print_irqtrace_events(task);
}
-/*
- * Returns true if a report was generated, false otherwise.
- */
-static bool print_report(enum kcsan_value_change value_change,
- enum kcsan_report_type type,
+static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info)
+ const struct other_info *other_info,
+ u64 old, u64 new, u64 mask)
{
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
@@ -344,25 +341,24 @@ static bool print_report(enum kcsan_value_change value_change,
* Must check report filter rules before starting to print.
*/
if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
- return false;
+ return;
- if (type == KCSAN_REPORT_RACE_SIGNAL) {
+ if (other_info) {
other_skipnr = get_stack_skipnr(other_info->stack_entries,
other_info->num_stack_entries);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
if (skip_report(value_change, other_frame))
- return false;
+ return;
}
if (rate_limit_report(this_frame, other_frame))
- return false;
+ return;
/* Print report header. */
pr_err("==================================================================\n");
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL: {
+ if (other_info) {
int cmp;
/*
@@ -374,22 +370,15 @@ static bool print_report(enum kcsan_value_change value_change,
get_bug_type(ai->access_type | other_info->ai.access_type),
(void *)(cmp < 0 ? other_frame : this_frame),
(void *)(cmp < 0 ? this_frame : other_frame));
- } break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
(void *)this_frame);
- break;
-
- default:
- BUG();
}
pr_err("\n");
/* Print information about the racing accesses. */
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL:
+ if (other_info) {
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(other_info->ai.access_type), other_info->ai.ptr,
other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
@@ -407,16 +396,10 @@ static bool print_report(enum kcsan_value_change value_change,
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- default:
- BUG();
}
/* Print stack trace of this thread. */
stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
@@ -425,24 +408,41 @@ static bool print_report(enum kcsan_value_change value_change,
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
+ /* Print observed value change. */
+ if (ai->size <= 8) {
+ int hex_len = ai->size * 2;
+ u64 diff = old ^ new;
+
+ if (mask)
+ diff &= mask;
+ if (diff) {
+ pr_err("\n");
+ pr_err("value changed: 0x%0*llx -> 0x%0*llx\n",
+ hex_len, old, hex_len, new);
+ if (mask) {
+ pr_err(" bits changed: 0x%0*llx with mask 0x%0*llx\n",
+ hex_len, diff, hex_len, mask);
+ }
+ }
+ }
+
/* Print report footer. */
pr_err("\n");
pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- return true;
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
{
- if (other_info)
- /*
- * Use size to denote valid/invalid, since KCSAN entirely
- * ignores 0-sized accesses.
- */
- other_info->ai.size = 0;
-
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely ignores
+ * 0-sized accesses.
+ */
+ other_info->ai.size = 0;
raw_spin_unlock_irqrestore(&report_lock, *flags);
}
@@ -460,7 +460,7 @@ static void set_other_info_task_blocking(unsigned long *flags,
* We may be instrumenting a code-path where current->state is already
* something other than TASK_RUNNING.
*/
- const bool is_running = current->state == TASK_RUNNING;
+ const bool is_running = task_is_running(current);
/*
* To avoid deadlock in case we are in an interrupt here and this is a
* race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
@@ -575,48 +575,42 @@ discard:
return false;
}
-/*
- * Depending on the report type either sets @other_info and returns false, or
- * awaits @other_info and returns true. If @other_info is not required for the
- * report type, simply acquires @report_lock and returns true.
- */
-static noinline bool prepare_report(unsigned long *flags,
- enum kcsan_report_type type,
- const struct access_info *ai,
- struct other_info *other_info)
-{
- switch (type) {
- case KCSAN_REPORT_CONSUMED_WATCHPOINT:
- prepare_report_producer(flags, ai, other_info);
- return false;
- case KCSAN_REPORT_RACE_SIGNAL:
- return prepare_report_consumer(flags, ai, other_info);
- default:
- /* @other_info not required; just acquire @report_lock. */
- raw_spin_lock_irqsave(&report_lock, *flags);
- return true;
- }
-}
-
-void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx)
+static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
+ int access_type)
{
- unsigned long flags = 0;
- const struct access_info ai = {
+ return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
.cpu_id = raw_smp_processor_id()
};
- struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
- ? NULL : &other_infos[watchpoint_idx];
+}
+
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ int watchpoint_idx)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ unsigned long flags;
kcsan_disable_current();
- if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos)))
- goto out;
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ prepare_report_producer(&flags, &ai, &other_infos[watchpoint_idx]);
+ lockdep_on();
+ kcsan_enable_current();
+}
+
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change, int watchpoint_idx,
+ u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ struct other_info *other_info = &other_infos[watchpoint_idx];
+ unsigned long flags = 0;
+
+ kcsan_disable_current();
/*
* Because we may generate reports when we're in scheduler code, the use
* of printk() could deadlock. Until such time that all printing code
@@ -626,22 +620,35 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type,
*/
lockdep_off();
- if (prepare_report(&flags, type, &ai, other_info)) {
- /*
- * Never report if value_change is FALSE, only if we it is
- * either TRUE or MAYBE. In case of MAYBE, further filtering may
- * be done once we know the full stack trace in print_report().
- */
- bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
- print_report(value_change, type, &ai, other_info);
+ if (!prepare_report_consumer(&flags, &ai, other_info))
+ goto out;
+ /*
+ * Never report if value_change is FALSE, only when it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ if (value_change != KCSAN_VALUE_CHANGE_FALSE)
+ print_report(value_change, &ai, other_info, old, new, mask);
- if (reported && panic_on_warn)
- panic("panic_on_warn set ...\n");
+ release_report(&flags, other_info);
+out:
+ lockdep_on();
+ kcsan_enable_current();
+}
- release_report(&flags, other_info);
- }
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ unsigned long flags;
+
+ kcsan_disable_current();
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ raw_spin_lock_irqsave(&report_lock, flags);
+ print_report(KCSAN_VALUE_CHANGE_TRUE, &ai, NULL, old, new, mask);
+ raw_spin_unlock_irqrestore(&report_lock, flags);
lockdep_on();
-out:
kcsan_enable_current();
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..b5e40f069768 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -19,26 +19,9 @@
#include "kexec_internal.h"
-static int copy_user_segment_list(struct kimage *image,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
-{
- int ret;
- size_t segment_bytes;
-
- /* Read in the segments */
- image->nr_segments = nr_segments;
- segment_bytes = nr_segments * sizeof(*segments);
- ret = copy_from_user(image->segment, segments, segment_bytes);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
- struct kexec_segment __user *segments,
+ struct kexec_segment *segments,
unsigned long flags)
{
int ret;
@@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
return -ENOMEM;
image->start = entry;
-
- ret = copy_user_segment_list(image, nr_segments, segments);
- if (ret)
- goto out_free_image;
+ image->nr_segments = nr_segments;
+ memcpy(image->segment, segments, nr_segments * sizeof(*segments));
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
@@ -104,12 +85,23 @@ out_free_image:
}
static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
- struct kexec_segment __user *segments, unsigned long flags)
+ struct kexec_segment *segments, unsigned long flags)
{
struct kimage **dest_image, *image;
unsigned long i;
int ret;
+ /*
+ * Because we write directly to the reserved memory region when loading
+ * crash kernels we need a mutex here to prevent multiple crash kernels
+ * from attempting to load simultaneously, and to prevent a crash kernel
+ * from loading over the top of a in use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
@@ -121,7 +113,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (nr_segments == 0) {
/* Uninstall image */
kimage_free(xchg(dest_image, NULL));
- return 0;
+ ret = 0;
+ goto out_unlock;
}
if (flags & KEXEC_ON_CRASH) {
/*
@@ -134,7 +127,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
if (ret)
- return ret;
+ goto out_unlock;
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
@@ -171,6 +164,8 @@ out:
arch_kexec_protect_crashkres();
kimage_free(image);
+out_unlock:
+ mutex_unlock(&kexec_mutex);
return ret;
}
@@ -236,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments,
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- int result;
+ struct kexec_segment *ksegments;
+ unsigned long result;
result = kexec_load_check(nr_segments, flags);
if (result)
@@ -247,20 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
+ ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+ if (IS_ERR(ksegments))
+ return PTR_ERR(ksegments);
- result = do_kexec_load(entry, nr_segments, segments, flags);
-
- mutex_unlock(&kexec_mutex);
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+ kfree(ksegments);
return result;
}
@@ -272,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, flags)
{
struct compat_kexec_segment in;
- struct kexec_segment out, __user *ksegments;
+ struct kexec_segment *ksegments;
unsigned long i, result;
result = kexec_load_check(nr_segments, flags);
@@ -285,37 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
+ GFP_KERNEL);
+ if (!ksegments)
+ return -ENOMEM;
+
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
- return -EFAULT;
+ goto fail;
- out.buf = compat_ptr(in.buf);
- out.bufsz = in.bufsz;
- out.mem = in.mem;
- out.memsz = in.memsz;
-
- result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result)
- return -EFAULT;
+ ksegments[i].buf = compat_ptr(in.buf);
+ ksegments[i].bufsz = in.bufsz;
+ ksegments[i].mem = in.mem;
+ ksegments[i].memsz = in.memsz;
}
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
result = do_kexec_load(entry, nr_segments, ksegments, flags);
- mutex_unlock(&kexec_mutex);
-
+fail:
+ kfree(ksegments);
return result;
}
#endif
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index f099baee3578..5a5d192a89ac 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -26,6 +26,7 @@
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
+#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
@@ -978,7 +979,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..790a573bbe00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/static_call.h>
#include <linux/perf_event.h>
#include <asm/sections.h>
@@ -106,7 +107,7 @@ void __weak *alloc_insn_page(void)
return module_alloc(PAGE_SIZE);
}
-void __weak free_insn_page(void *page)
+static void free_insn_page(void *page)
{
module_memfree(page);
}
@@ -321,11 +322,21 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
}
#ifdef CONFIG_OPTPROBES
+void __weak *alloc_optinsn_page(void)
+{
+ return alloc_insn_page();
+}
+
+void __weak free_optinsn_page(void *page)
+{
+ free_insn_page(page);
+}
+
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
- .alloc = alloc_insn_page,
- .free = free_insn_page,
+ .alloc = alloc_optinsn_page,
+ .free = free_optinsn_page,
.sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
@@ -1183,23 +1194,6 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(aggr_post_handler);
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
-{
- struct kprobe *cur = __this_cpu_read(kprobe_instance);
-
- /*
- * if we faulted "during" the execution of a user specified
- * probe handler, invoke just that probe's fault handler
- */
- if (cur && cur->fault_handler) {
- if (cur->fault_handler(cur, regs, trapnr))
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(aggr_fault_handler);
-
/* Walks the list and increments nmissed count for multiprobe case */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
@@ -1330,7 +1324,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
ap->addr = p->addr;
ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
ap->pre_handler = aggr_pre_handler;
- ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
if (p->post_handler && !kprobe_gone(p))
ap->post_handler = aggr_post_handler;
@@ -1569,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
+ static_call_text_reserved(p->addr, p->addr) ||
find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
@@ -2014,7 +2008,6 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
- rp->kp.fault_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0fccf7d0c6a1..5b37a8567168 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -68,16 +68,6 @@ enum KTHREAD_BITS {
KTHREAD_SHOULD_PARK,
};
-static inline void set_kthread_struct(void *kthread)
-{
- /*
- * We abuse ->set_child_tid to avoid the new member and because it
- * can't be wrongly copied by copy_process(). We also rely on fact
- * that the caller can't exec, so PF_KTHREAD can't be cleared.
- */
- current->set_child_tid = (__force void __user *)kthread;
-}
-
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD));
@@ -103,6 +93,22 @@ static inline struct kthread *__to_kthread(struct task_struct *p)
return kthread;
}
+void set_kthread_struct(struct task_struct *p)
+{
+ struct kthread *kthread;
+
+ if (__to_kthread(p))
+ return;
+
+ kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+ /*
+ * We abuse ->set_child_tid to avoid the new member and because it
+ * can't be wrongly copied by copy_process(). We also rely on fact
+ * that the caller can't exec, so PF_KTHREAD can't be cleared.
+ */
+ p->set_child_tid = (__force void __user *)kthread;
+}
+
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
@@ -272,8 +278,8 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kzalloc(sizeof(*self), GFP_KERNEL);
- set_kthread_struct(self);
+ set_kthread_struct(current);
+ self = to_kthread(current);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
@@ -451,7 +457,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
unsigned long flags;
@@ -467,7 +473,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
__kthread_bind_mask(p, cpumask_of(cpu), state);
}
@@ -1156,14 +1162,14 @@ static bool __kthread_cancel_work(struct kthread_work *work)
* modify @dwork's timer so that it expires after @delay. If @delay is zero,
* @work is guaranteed to be queued immediately.
*
- * Return: %true if @dwork was pending and its timer was modified,
- * %false otherwise.
+ * Return: %false if @dwork was idle and queued, %true otherwise.
*
* A special case is when the work is being canceled in parallel.
* It might be caused either by the real kthread_cancel_delayed_work_sync()
* or yet another kthread_mod_delayed_work() call. We let the other command
- * win and return %false here. The caller is supposed to synchronize these
- * operations a reasonable way.
+ * win and return %true here. The return value can be used for reference
+ * counting and the number of queued works stays the same. Anyway, the caller
+ * is supposed to synchronize these operations a reasonable way.
*
* This function is safe to call from any context including IRQ handler.
* See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
@@ -1175,13 +1181,15 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
{
struct kthread_work *work = &dwork->work;
unsigned long flags;
- int ret = false;
+ int ret;
raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
- if (!work->worker)
+ if (!work->worker) {
+ ret = false;
goto fast_queue;
+ }
/* Work must not be used with >1 worker, see kthread_queue_work() */
WARN_ON_ONCE(work->worker != worker);
@@ -1199,8 +1207,11 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
* be used for reference counting.
*/
kthread_cancel_delayed_work_timer(work, &flags);
- if (work->canceling)
+ if (work->canceling) {
+ /* The number of works in the queue does not change. */
+ ret = true;
goto out;
+ }
ret = __kthread_cancel_work(work);
fast_queue:
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 3a4beb9395c4..291b857a6e20 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -411,7 +411,7 @@ void klp_try_complete_transition(void)
/*
* Ditto for the idle "swapper" tasks.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
@@ -423,7 +423,7 @@ void klp_try_complete_transition(void)
task->patch_state = klp_target_state;
}
}
- put_online_cpus();
+ cpus_read_unlock();
if (!complete) {
if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 3572808223e4..d51cabf28f38 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,7 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e32313072506..bf1c00c881e4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -760,7 +760,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
*/
- if (p->state == TASK_RUNNING && p != current)
+ if (p != current && task_is_running(p))
return;
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
@@ -2306,7 +2306,56 @@ static void print_lock_class_header(struct lock_class *class, int depth)
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (bacwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a bacwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follow:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
@@ -2334,6 +2383,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
} while (entry && (depth >= 0));
}
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follow:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+}
+
static void
print_irq_lock_scenario(struct lock_list *safe_entry,
struct lock_list *unsafe_entry,
@@ -2448,10 +2552,7 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- prev_root->trace = save_trace();
- if (!prev_root->trace)
- return;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
@@ -2669,8 +2770,18 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
* Step 3: we found a bad match! Now retrieve a lock from the backward
* list whose usage mask matches the exclusive usage mask from the
* lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the follow case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, if we use exact
+ * invert bits of M's usage_mask, we will find another lock N that is
+ * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not
+ * cause a inversion deadlock.
*/
- backward_mask = original_mask(target_entry1->class->usage_mask);
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
if (bfs_error(ret)) {
@@ -2720,7 +2831,7 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
*
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
* any error appears in the bfs search.
*/
static noinline enum bfs_result
@@ -4579,7 +4690,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
u8 curr_inner;
int depth;
- if (!curr->lockdep_depth || !next_inner || next->trylock)
+ if (!next_inner || next->trylock)
return 0;
if (!next_outer)
@@ -6395,6 +6506,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6404,11 +6516,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: "",
- rcu_scheduler_active, debug_locks);
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 806978314496..b8d9a050c337 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -70,26 +70,28 @@ static int l_show(struct seq_file *m, void *v)
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
#endif
-#ifdef CONFIG_PROVE_LOCKING
- seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
- seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
-#endif
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
- get_usage_chars(class, usage);
- seq_printf(m, " %s", usage);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+ }
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (entry->distance == 1) {
- seq_printf(m, " -> [%p] ", entry->class->key);
- print_name(m, entry->class);
- seq_puts(m, "\n");
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
}
+ seq_puts(m, "\n");
}
- seq_puts(m, "\n");
return 0;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index b3adb40549bf..7c5a4a087cc7 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -59,7 +59,7 @@ static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
static unsigned long last_lock_release;
struct lock_stress_stats {
@@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg)
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
lock_is_write_held = true;
- if (WARN_ON_ONCE(lock_is_read_held))
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
lwsp->n_lock_fail++; /* rare, but... */
lwsp->n_lock_acquired++;
@@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock(tid);
- lock_is_read_held = true;
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = false;
+ atomic_dec(&lock_is_read_held);
cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
@@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
+ long cur;
bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -996,7 +998,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index db9301591e3f..bc8abb8549d2 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -22,7 +20,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 53e631e1d76d..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 013e1b08a1bf..d456579d0952 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,17 +30,20 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
+
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
+# define MUTEX_WARN_ON(cond)
#endif
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
@@ -91,55 +94,56 @@ static inline unsigned long __owner_flags(unsigned long owner)
return owner & MUTEX_FLAGS;
}
-/*
- * Trylock variant that returns the owning task on failure.
- */
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
- unsigned long old, flags = __owner_flags(owner);
+ unsigned long flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (task) {
- if (likely(task != curr))
- break;
-
- if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
break;
-
- flags &= ~MUTEX_FLAG_PICKUP;
+ }
} else {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
}
- /*
- * We set the HANDOFF bit, we must make sure it doesn't live
- * past the point where we acquire it. This would be possible
- * if we (accidentally) set the bit on an unlocked mutex.
- */
- flags &= ~MUTEX_FLAG_HANDOFF;
-
- old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
- if (old == owner)
- return NULL;
-
- owner = old;
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
}
return __owner_task(owner);
}
/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
- return !__mutex_trylock_or_owner(lock);
+ return !__mutex_trylock_common(lock, false);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -168,10 +172,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
- if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
- return true;
-
- return false;
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
#endif
@@ -226,23 +227,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
unsigned long owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old, new;
+ unsigned long new;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
- old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
- if (old == owner)
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
break;
-
- owner = old;
}
}
@@ -286,218 +282,18 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-/*
- * Wait-Die:
- * The newer transactions are killed when:
- * It (the new transaction) makes a request for a lock being held
- * by an older transaction.
- *
- * Wound-Wait:
- * The newer transactions are wounded when:
- * An older transaction makes a request for a lock being held by
- * the newer transaction.
- */
-
-/*
- * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
- * it.
- */
-static __always_inline void
-ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
- ww->ctx = ww_ctx;
-}
-
-/*
- * Determine if context @a is 'after' context @b. IOW, @a is a younger
- * transaction than @b and depending on algorithm either needs to wait for
- * @b or die.
- */
-static inline bool __sched
-__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
-{
-
- return (signed long)(a->stamp - b->stamp) > 0;
-}
-
-/*
- * Wait-Die; wake a younger waiter context (when locks held) such that it can
- * die.
- *
- * Among waiters with context, only the first one can have other locks acquired
- * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
- * __ww_mutex_check_kill() wake any but the earliest context.
- */
-static bool __sched
-__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ww_ctx)
-{
- if (!ww_ctx->is_wait_die)
- return false;
-
- if (waiter->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, waiter);
- wake_up_process(waiter->task);
- }
-
- return true;
-}
-
-/*
- * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
- *
- * Wound the lock holder if there are waiters with older transactions than
- * the lock holders. Even if multiple waiters may wound the lock holder,
- * it's sufficient that only one does.
- */
-static bool __ww_mutex_wound(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx,
- struct ww_acquire_ctx *hold_ctx)
-{
- struct task_struct *owner = __mutex_owner(lock);
-
- lockdep_assert_held(&lock->wait_lock);
-
- /*
- * Possible through __ww_mutex_add_waiter() when we race with
- * ww_mutex_set_context_fastpath(). In that case we'll get here again
- * through __ww_mutex_check_waiters().
- */
- if (!hold_ctx)
- return false;
-
- /*
- * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
- * it cannot go away because we'll have FLAG_WAITERS set and hold
- * wait_lock.
- */
- if (!owner)
- return false;
-
- if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
- hold_ctx->wounded = 1;
-
- /*
- * wake_up_process() paired with set_current_state()
- * inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_check_kill() or has a
- * wakeup pending to re-read the wounded state.
- */
- if (owner != current)
- wake_up_process(owner);
-
- return true;
- }
-
- return false;
-}
-
-/*
- * We just acquired @lock under @ww_ctx, if there are later contexts waiting
- * behind us on the wait-list, check if they need to die, or wound us.
- *
- * See __ww_mutex_add_waiter() for the list-order construction; basically the
- * list is ordered by stamp, smallest (oldest) first.
- *
- * This relies on never mixing wait-die/wound-wait on the same wait-list;
- * which is currently ensured by that being a ww_class property.
- *
- * The current task must not be on the wait list.
- */
-static void __sched
-__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
-
- lockdep_assert_held(&lock->wait_lock);
-
- list_for_each_entry(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
+#include "ww_mutex.h"
- if (__ww_mutex_die(lock, cur, ww_ctx) ||
- __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
- break;
- }
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
- * and wake up any waiters so they can recheck.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- ww_mutex_lock_acquired(lock, ctx);
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the WAITERS check is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* See comments above and below. */
-
- /*
- * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
- * MB MB
- * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
- *
- * The memory barrier above pairs with the memory barrier in
- * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
- * and/or !empty list.
- */
- if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
- return;
-
- /*
- * Uh oh, we raced in fastpath, check if any of the waiters need to
- * die or wound us.
- */
- spin_lock(&lock->base.wait_lock);
- __ww_mutex_check_waiters(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
+ return __mutex_trylock_common(lock, false);
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
@@ -754,181 +550,20 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
+ __ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-
-static __always_inline int __sched
-__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- return 0;
-}
-
-
-/*
- * Check the wound condition for the current lock acquire.
- *
- * Wound-Wait: If we're wounded, kill ourself.
- *
- * Wait-Die: If we're trying to acquire a lock already held by an older
- * context, kill ourselves.
- *
- * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
- * look at waiters before us in the wait-list.
- */
-static inline int __sched
-__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
- struct mutex_waiter *cur;
-
- if (ctx->acquired == 0)
- return 0;
-
- if (!ctx->is_wait_die) {
- if (ctx->wounded)
- return __ww_mutex_kill(lock, ctx);
-
- return 0;
- }
-
- if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- return __ww_mutex_kill(lock, ctx);
-
- /*
- * If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must kill ourself.
- */
- cur = waiter;
- list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- return __ww_mutex_kill(lock, ctx);
- }
-
- return 0;
-}
-
-/*
- * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
- * first. Such that older contexts are preferred to acquire the lock over
- * younger contexts.
- *
- * Waiters without context are interspersed in FIFO order.
- *
- * Furthermore, for Wait-Die kill ourself immediately when possible (there are
- * older contexts already waiting) to avoid unnecessary waiting and for
- * Wound-Wait ensure we wound the owning context when it is younger.
- */
-static inline int __sched
-__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
- struct list_head *pos;
- bool is_wait_die;
-
- if (!ww_ctx) {
- __mutex_add_waiter(lock, waiter, &lock->wait_list);
- return 0;
- }
-
- is_wait_die = ww_ctx->is_wait_die;
-
- /*
- * Add the waiter before the first waiter with a higher stamp.
- * Waiters without a context are skipped to avoid starving
- * them. Wait-Die waiters may die here. Wound-Wait waiters
- * never die here, but they are sorted in stamp order and
- * may wound the lock holder.
- */
- pos = &lock->wait_list;
- list_for_each_entry_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /*
- * Wait-Die: if we find an older context waiting, there
- * is no point in queueing behind it, as we'd have to
- * die the moment it would acquire the lock.
- */
- if (is_wait_die) {
- int ret = __ww_mutex_kill(lock, ww_ctx);
-
- if (ret)
- return ret;
- }
-
- break;
- }
-
- pos = &cur->list;
-
- /* Wait-Die: ensure younger waiters die. */
- __ww_mutex_die(lock, cur, ww_ctx);
- }
-
- __mutex_add_waiter(lock, waiter, pos);
-
- /*
- * Wound-Wait: if we're blocking on a mutex owned by a younger context,
- * wound that such that we might proceed.
- */
- if (!is_wait_die) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-
- /*
- * See ww_mutex_set_context_fastpath(). Orders setting
- * MUTEX_FLAG_WAITERS vs the ww->ctx load,
- * such that either we or the fastpath will wound @ww->ctx.
- */
- smp_mb();
- __ww_mutex_wound(lock, ww_ctx, ww->ctx);
- }
-
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
int ret;
@@ -937,9 +572,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
might_sleep();
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
ww = container_of(lock, struct ww_mutex, base);
if (ww_ctx) {
@@ -953,6 +586,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
if (ww_ctx->acquired == 0)
ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
}
preempt_disable();
@@ -968,7 +605,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
@@ -980,17 +617,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
debug_mutex_lock_common(lock, &waiter);
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
__mutex_add_waiter(lock, &waiter, &lock->wait_list);
-
-
-#ifdef CONFIG_DEBUG_MUTEXES
- waiter.ww_ctx = MUTEX_POISON_WW_CTX;
-#endif
} else {
/*
* Add in stamp order, waking up waiters that must kill
@@ -999,14 +634,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
goto err_early_kill;
-
- waiter.ww_ctx = ww_ctx;
}
- waiter.task = current;
-
set_current_state(state);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -1032,18 +665,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- /*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
- */
- if (ww_ctx || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
+ first = __mutex_waiter_is_first(lock, &waiter);
set_current_state(state);
/*
@@ -1051,13 +676,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock(lock) ||
+ if (__mutex_trylock_or_handoff(lock, first) ||
(first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
acquired:
__set_current_state(TASK_RUNNING);
@@ -1082,7 +707,7 @@ skip_wait:
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
@@ -1090,7 +715,7 @@ err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
@@ -1098,18 +723,17 @@ err_early_kill:
}
static int __sched
-__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip,
- struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
{
- return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -1189,8 +813,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1205,8 +828,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1237,29 +859,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
if (owner & MUTEX_FLAG_HANDOFF)
break;
- old = atomic_long_cmpxchg_release(&lock->owner, owner,
- __owner_flags(owner));
- if (old == owner) {
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
if (owner & MUTEX_FLAG_WAITERS)
break;
return;
}
-
- owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1276,7 +890,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
@@ -1380,7 +994,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1388,7 +1002,7 @@ static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1412,9 +1026,7 @@ int __sched mutex_trylock(struct mutex *lock)
{
bool locked;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
locked = __mutex_trylock(lock);
if (locked)
@@ -1455,7 +1067,8 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index f0c710b1d192..0b2a79c4013b 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -5,19 +5,41 @@
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
-{
-}
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock, name, key) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 406818196a9f..6bb116c559b4 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,20 +8,58 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
+#include <linux/ww_mutex.h>
#include "rtmutex_common.h"
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
+
/*
* lock->owner state tracking:
*
@@ -50,7 +88,7 @@
*/
static __always_inline void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
@@ -60,13 +98,13 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
WRITE_ONCE(lock->owner, (struct task_struct *)val);
}
-static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -141,15 +179,26 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_release(&lock->owner, &old, new);
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
* all future threads that attempt to [Rmw] the lock to the slowpath. As such
* relaxed semantics suffice.
*/
-static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -165,7 +214,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
unsigned long flags)
__releases(lock->wait_lock)
{
@@ -201,10 +250,22 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+
+}
-static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -213,7 +274,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
unsigned long flags)
__releases(lock->wait_lock)
{
@@ -223,11 +284,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ waiter->prio = __waiter_prio(task);
+ waiter->deadline = task->dl.deadline;
+}
+
/*
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -265,22 +343,63 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
+{
+ if (rt_mutex_waiter_less(waiter, top_waiter))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ return false;
+
+ return rt_mutex_waiter_equal(waiter, top_waiter);
+#else
+ return false;
+#endif
+}
+
#define __node_2_waiter(node) \
rb_entry((node), struct rt_mutex_waiter, tree_entry)
static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
- return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_mutex_waiter_less(aw, bw))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_mutex_waiter_less(bw, aw))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
+ }
+
+ return 0;
}
static __always_inline void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
}
static __always_inline void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
@@ -326,6 +445,35 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(w->task);
+ wqh->rtlock_task = w->task;
+ } else {
+ wake_q_add(&wqh->head, w->task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -343,17 +491,12 @@ static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk chwalk)
{
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
return waiter != NULL;
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -423,15 +566,15 @@ static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct
*/
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
bool requeue = true;
@@ -514,6 +657,31 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -574,8 +742,21 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -653,8 +834,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* serializes all pi_waiters access and rb_erase() does not care about
* the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
@@ -676,7 +856,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* to get the lock.
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ wake_up_state(waiter->task, waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -779,7 +959,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* callsite called task_blocked_on_lock(), otherwise NULL
*/
static int __sched
-try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -815,19 +995,21 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -838,13 +1020,9 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -897,14 +1075,15 @@ takeit:
*
* This must be called with lock->wait_lock held and interrupts disabled
*/
-static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
lockdep_assert_held(&lock->wait_lock);
@@ -924,8 +1103,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -936,6 +1114,21 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&task->pi_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
+
if (!owner)
return 0;
@@ -986,8 +1179,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
- struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
@@ -1023,235 +1216,14 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
}
-/*
- * Remove a waiter from a lock and give up
- *
- * Must be called with lock->wait_lock held and interrupts disabled. I must
- * have just failed to try_to_take_rt_mutex().
- */
-static void __sched remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
-{
- bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
- struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
-
- lockdep_assert_held(&lock->wait_lock);
-
- raw_spin_lock(&current->pi_lock);
- rt_mutex_dequeue(lock, waiter);
- current->pi_blocked_on = NULL;
- raw_spin_unlock(&current->pi_lock);
-
- /*
- * Only update priority if the waiter was the highest priority
- * waiter of the lock and there is an owner to update.
- */
- if (!owner || !is_top_waiter)
- return;
-
- raw_spin_lock(&owner->pi_lock);
-
- rt_mutex_dequeue_pi(owner, waiter);
-
- if (rt_mutex_has_waiters(lock))
- rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-
- rt_mutex_adjust_prio(owner);
-
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
-
- raw_spin_unlock(&owner->pi_lock);
-
- /*
- * Don't walk the chain, if the owner task is not blocked
- * itself.
- */
- if (!next_lock)
- return;
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
- next_lock, NULL, current);
-
- raw_spin_lock_irq(&lock->wait_lock);
-}
-
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void __sched rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
-}
-
-void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
-/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
- * @lock: the rt_mutex to take
- * @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
- * @timeout: the pre-initialized and started timer, or NULL for none
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Must be called with lock->wait_lock held and interrupts disabled
- */
-static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
-{
- int ret = 0;
-
- for (;;) {
- /* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
- break;
-
- if (timeout && !timeout->task) {
- ret = -ETIMEDOUT;
- break;
- }
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
- break;
- }
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- schedule();
-
- raw_spin_lock_irq(&lock->wait_lock);
- set_current_state(state);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
- struct rt_mutex_waiter *w)
-{
- /*
- * If the result is not -EDEADLOCK or the caller requested
- * deadlock detection, nothing to do here.
- */
- if (res != -EDEADLOCK || detect_deadlock)
- return;
-
- /*
- * Yell loudly and stop the task right here.
- */
- WARN(1, "rtmutex deadlock detected\n");
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
-}
-
-/*
- * Slow path lock function:
- */
-static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
-{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
-
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
-
- if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
- __set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
- }
-
- /*
- * try_to_take_rt_mutex() sets the waiter bit
- * unconditionally. We might have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
-
- return ret;
-}
-
-static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
int ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1267,7 +1239,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path try-lock function:
*/
-static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
unsigned long flags;
int ret;
@@ -1293,25 +1265,20 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
return ret;
}
-/*
- * Performs the wakeup of the top-waiter and re-enables preemption.
- */
-void __sched rt_mutex_postunlock(struct wake_q_head *wake_q)
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
{
- wake_up_q(wake_q);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
- /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
- preempt_enable();
+ return rt_mutex_slowtrylock(lock);
}
/*
* Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
*/
-static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
{
- DEFINE_WAKE_Q(wake_q);
+ DEFINE_RT_WAKE_Q(wqh);
unsigned long flags;
/* irqsave required to support early boot calls */
@@ -1364,422 +1331,387 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
- mark_wakeup_next_waiter(&wake_q, lock);
+ mark_wakeup_next_waiter(&wqh, lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- rt_mutex_postunlock(&wake_q);
+ rt_mutex_wake_up_q(&wqh);
}
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state,
- unsigned int subclass)
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
{
- int ret;
-
- might_sleep();
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
- return ret;
+ rt_mutex_slowunlock(lock);
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
- * rt_mutex_lock_nested - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- * @subclass: the lockdep subclass
- */
-void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
-
-#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+ bool res = true;
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner->on_cpu || need_resched() ||
+ rt_mutex_waiter_is_top_waiter(lock, waiter) ||
+ vcpu_is_preempted(task_cpu(owner))) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
+}
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0);
+ return false;
}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
#endif
-/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+#ifdef RT_MUTEX_BUILD_MUTEX
+/*
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
-{
- return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to call it
- * from atomic regions, but not from hard or soft interrupt context.
+/*
+ * Remove a waiter from a lock and give up
*
- * Returns:
- * 1 on success
- * 0 on contention
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
+ * have just failed to try_to_take_rt_mutex().
*/
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
+static void __sched remove_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
- int ret;
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_base *next_lock;
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
- return 0;
+ lockdep_assert_held(&lock->wait_lock);
+
+ raw_spin_lock(&current->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock(&current->pi_lock);
/*
- * No lockdep annotation required because lockdep disables the fast
- * path.
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
*/
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
-
- ret = rt_mutex_slowtrylock(lock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
-{
- mutex_release(&lock->dep_map, _RET_IP_);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ if (!owner || !is_top_waiter)
return;
- rt_mutex_slowunlock(lock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+ raw_spin_lock(&owner->pi_lock);
-/*
- * Futex variants, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
-}
+ rt_mutex_dequeue_pi(owner, waiter);
-int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return __rt_mutex_slowtrylock(lock);
-}
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-/**
- * __rt_mutex_futex_unlock - Futex variant, that since futex variants
- * do not use the fast-path, can be simple and will not need to retry.
- *
- * @lock: The rt_mutex to be unlocked
- * @wake_q: The wake queue head from which to get the next lock waiter
- */
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
-{
- lockdep_assert_held(&lock->wait_lock);
+ rt_mutex_adjust_prio(owner);
- debug_rt_mutex_unlock(lock);
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
- }
+ raw_spin_unlock(&owner->pi_lock);
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ if (!next_lock)
+ return;
- return true; /* call postunlock() */
-}
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
-{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
- bool postunlock;
+ raw_spin_unlock_irq(&lock->wait_lock);
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/**
- * __rt_mutex_init - initialize the rt_mutex
- *
- * @lock: The rt_mutex to be initialized
- * @name: The lock name used for debugging
- * @key: The lock class key used for debugging
- *
- * Initialize the rt_mutex to unlocked state.
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
*
- * Initializing of a locked rt_mutex is not allowed
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
-void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
{
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
+ int ret = 0;
- __rt_mutex_basic_init(lock);
-}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter))
+ break;
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
- */
-void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- __rt_mutex_basic_init(lock);
- rt_mutex_set_owner(lock, proxy_owner);
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
+ if (ret)
+ break;
+ }
+
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ schedule();
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return ret;
}
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
- */
-void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock)
+static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
+ /*
+ * Yell loudly and stop the task right here.
+ */
+ WARN(1, "rtmutex deadlock detected\n");
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
}
/**
- * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: does _NOT_ remove the @waiter on failure; must either call
- * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
*/
-int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
int ret;
lockdep_assert_held(&lock->wait_lock);
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ return 0;
+ }
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ set_current_state(state);
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
+ if (likely(!ret))
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ } else {
+ __set_current_state(TASK_RUNNING);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
}
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
return ret;
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
- * on failure.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
+ struct rt_mutex_waiter waiter;
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter);
+
+ debug_rt_mutex_free_waiter(&waiter);
return ret;
}
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
+/*
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
+ unsigned long flags;
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
*/
- fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+
+ return rt_mutex_slowlock(lock, NULL, state);
+}
+#endif /* RT_MUTEX_BUILD_MUTEX */
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+/*
+ * Functions required for spin/rw_lock substitution on RT kernels
+ */
+
/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
- * rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
*/
-bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
{
- bool cleanup = false;
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, current, NULL))
+ return;
+
+ rt_mutex_init_rtlock_waiter(&waiter);
+
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
+
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter))
+ break;
+
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ schedule_rtlock();
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
}
+
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
+
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
*/
fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return cleanup;
+ debug_rt_mutex_free_waiter(&waiter);
}
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-void rt_mutex_debug_task_free(struct task_struct *task)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
}
-#endif
+
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..5c9299aaabae
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,590 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+void __mutex_rt_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(__mutex_rt_init);
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched mutex_lock_killable_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index a90c22abdbca..c47e8361bfb5 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -25,29 +25,90 @@
* @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
* @prio: Priority of the waiter
* @deadline: Deadline of the waiter if applicable
+ * @ww_ctx: WW context pointer
*/
struct rt_mutex_waiter {
struct rb_node tree_entry;
struct rb_node pi_tree_entry;
struct task_struct *task;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
int prio;
u64 deadline;
+ struct ww_acquire_ctx *ww_ctx;
};
+/**
+ * rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
+};
+
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
/*
* Must be guarded because this header is included from rcu/tree_plugin.h
* unconditionally.
*/
#ifdef CONFIG_RT_MUTEXES
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+}
+
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
@@ -72,19 +133,12 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
#define RT_MUTEX_HAS_WAITERS 1UL
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
{
unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
}
-#else /* CONFIG_RT_MUTEXES */
-/* Used in rcu/tree_plugin.h */
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
-{
- return NULL;
-}
-#endif /* !CONFIG_RT_MUTEXES */
/*
* Constants for rt mutex functions which have a selectable deadlock
@@ -101,49 +155,21 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-static inline void __rt_mutex_basic_init(struct rt_mutex *lock)
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
{
- lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
}
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
-
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
-
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
-
/* Debug functions */
-static inline void debug_rt_mutex_unlock(struct rt_mutex *lock)
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
{
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
}
-static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
{
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
@@ -161,4 +187,27 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
memset(waiter, 0x22, sizeof(*waiter));
}
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..88191f6e252c
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */
+ if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ int ret;
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Allow readers, as long as the writer has not completely
+ * acquired the semaphore for write.
+ */
+ if (atomic_read(&rwb->readers) != WRITER_BIAS) {
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ return 0;
+ }
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ wake_up_state(owner, state);
+
+ raw_spin_unlock_irq(&rtm->wait_lock);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 809b0016d344..000e8d5a2884 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,6 +28,7 @@
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
/*
@@ -889,7 +890,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
-rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state)
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
{
long adjustment = -RWSEM_READER_BIAS;
long rcnt = (count >> RWSEM_READER_SHIFT);
@@ -1165,7 +1166,7 @@ out_nolock:
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
*/
-static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
DEFINE_WAKE_Q(wake_q);
@@ -1297,7 +1298,7 @@ static inline void __up_read(struct rw_semaphore *sem)
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
clear_nonspinnable(sem);
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
}
}
@@ -1319,7 +1320,7 @@ static inline void __up_write(struct rw_semaphore *sem)
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
}
/*
@@ -1344,6 +1345,116 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
rwsem_downgrade_wake(sem);
}
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_schedule() \
+ schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
+}
+
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* lock for reading
*/
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9aa855a96c4a..9ee381e4d2a4 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -54,6 +54,7 @@ void down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -77,6 +78,7 @@ int down_interruptible(struct semaphore *sem)
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -103,6 +105,7 @@ int down_killable(struct semaphore *sem)
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -157,6 +160,7 @@ int down_timeout(struct semaphore *sem, long timeout)
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c8d7ad9fb9b2..c5830cfa379a 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index b9d93087ee66..14235671a1a7 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..d2912e44d61f
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ ___might_sleep(__FILE__, __LINE__, 0);
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+{
+ rtlock_slowlock_locked(rtm);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock)
+{
+ ___might_sleep(__FILE__, __LINE__, 0);
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock)
+{
+ ___might_sleep(__FILE__, __LINE__, 0);
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __sched rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+int __sched rt_rwlock_is_contended(rwlock_t *rwlock)
+{
+ return rw_base_is_contended(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_rwlock_is_contended);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..56f139201f24
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock)
+{
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree_entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree_entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->rtmutex.wait_lock);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_unlock(&lock->rtmutex.wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_prio(a_prio) || rt_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base);
+ __ww_mutex_check_waiters(&lock->base, ctx);
+ unlock_wait_lock(&lock->base);
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..3f1fff7d2780
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
diff --git a/kernel/module.c b/kernel/module.c
index 927d46cb8eb9..5c26a76e800b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -13,6 +13,7 @@
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/buildid.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
@@ -1018,8 +1019,7 @@ void __symbol_put(const char *symbol)
};
preempt_disable();
- if (!find_symbol(&fsa))
- BUG();
+ BUG_ON(!find_symbol(&fsa));
module_put(fsa.owner);
preempt_enable();
}
@@ -1466,6 +1466,13 @@ resolve_symbol_wait(struct module *mod,
return ksym;
}
+#ifdef CONFIG_KALLSYMS
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#endif
+
/*
* /sys/module/foo/sections stuff
* J. Corbet <corbet@lwn.net>
@@ -1473,11 +1480,6 @@ resolve_symbol_wait(struct module *mod,
#ifdef CONFIG_SYSFS
#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-
struct module_sect_attr {
struct bin_attribute battr;
unsigned long address;
@@ -2798,6 +2800,26 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
}
#endif /* CONFIG_KALLSYMS */
+#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
{
if (!debug)
@@ -3333,6 +3355,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(unsigned long),
&mod->num_kprobe_blacklist);
#endif
+#ifdef CONFIG_PRINTK_INDEX
+ mod->printk_index_start = section_objs(info, ".printk_index",
+ sizeof(*mod->printk_index_start),
+ &mod->printk_index_size);
+#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
mod->static_call_sites = section_objs(info, ".static_call_sites",
sizeof(*mod->static_call_sites),
@@ -4022,6 +4049,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_arch_cleanup;
}
+ init_build_id(mod, info);
dynamic_debug_setup(mod, info->debug, info->num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
@@ -4255,6 +4283,7 @@ const char *module_address_lookup(unsigned long addr,
unsigned long *size,
unsigned long *offset,
char **modname,
+ const unsigned char **modbuildid,
char *namebuf)
{
const char *ret = NULL;
@@ -4265,6 +4294,13 @@ const char *module_address_lookup(unsigned long addr,
if (mod) {
if (modname)
*modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
ret = find_kallsyms_symbol(mod, addr, size, offset);
}
@@ -4425,9 +4461,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
ret = fn(data, kallsyms_symbol_name(kallsyms, i),
mod, kallsyms_symbol_value(sym));
if (ret != 0)
- break;
+ goto out;
}
}
+out:
mutex_unlock(&module_mutex);
return ret;
}
@@ -4452,8 +4489,10 @@ static void cfi_init(struct module *mod)
/* Fix init/exit functions to point to the CFI jump table */
if (init)
mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
if (exit)
mod->exit = *exit;
+#endif
cfi_module_add(mod, module_addr_min);
#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 1b019cbca594..b8251dc0bc0f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
-int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
- unsigned long val_up, unsigned long val_down, void *v)
-{
- unsigned long flags;
- int ret;
-
- /*
- * Musn't use RCU; because then the notifier list can
- * change between the up and down traversal.
- */
- spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
- spin_unlock_irqrestore(&nh->lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
-
/**
* atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index abc01fcad8c7..eec72ca962e2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -568,6 +568,6 @@ out:
int __init nsproxy_cache_init(void)
{
- nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
return 0;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index d4d3ba6e1728..18d3a5c699d8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -9,19 +9,6 @@
*
* Copyright (c) 2020 Oracle and/or its affiliates.
* Author: Daniel Jordan <daniel.m.jordan@oracle.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <linux/completion.h>
@@ -211,7 +198,7 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- atomic_inc(&pd->refcnt);
+ refcount_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
@@ -383,7 +370,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (atomic_sub_and_test(cnt, &pd->refcnt))
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
padata_free_pd(pd);
}
@@ -593,7 +580,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_reorder_list(pd);
padata_init_squeues(pd);
pd->seq_nr = -1;
- atomic_set(&pd->refcnt, 1);
+ refcount_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -667,7 +654,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (atomic_dec_and_test(&ps->opd->refcnt))
+ if (refcount_dec_and_test(&ps->opd->refcnt))
padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -733,7 +720,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&pinst->lock);
switch (cpumask_type) {
@@ -753,7 +740,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
out:
mutex_unlock(&pinst->lock);
- put_online_cpus();
+ cpus_read_unlock();
return err;
}
@@ -992,7 +979,7 @@ struct padata_instance *padata_alloc(const char *name)
if (!pinst->parallel_wq)
goto err_free_inst;
- get_online_cpus();
+ cpus_read_lock();
pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
WQ_CPU_INTENSIVE, 1, name);
@@ -1026,7 +1013,7 @@ struct padata_instance *padata_alloc(const char *name)
&pinst->cpu_dead_node);
#endif
- put_online_cpus();
+ cpus_read_unlock();
return pinst;
@@ -1036,7 +1023,7 @@ err_free_masks:
err_free_serial_wq:
destroy_workqueue(pinst->serial_wq);
err_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
@@ -1074,9 +1061,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
ps->pinst = pinst;
- get_online_cpus();
+ cpus_read_lock();
pd = padata_alloc_pd(ps);
- put_online_cpus();
+ cpus_read_unlock();
if (!pd)
goto out_free_ps;
diff --git a/kernel/panic.c b/kernel/panic.c
index 332736a72a58..cefd7d82366f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
+#include <linux/panic_notifier.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/init.h>
@@ -247,7 +248,6 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -271,8 +271,6 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
diff --git a/kernel/params.c b/kernel/params.c
index 2daa2780a92c..8299bd764e42 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
+int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned int num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtouint(val, 0, &num);
+ if (ret)
+ return ret;
+ if (num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(param_set_uint_minmax);
+
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {
diff --git a/kernel/pid.c b/kernel/pid.c
index ebdf9c60cd0b..efe87db44683 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
+ * This symbol should not be explicitly exported to loadable modules.
+ *
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid, unsigned int flags)
+int pidfd_create(struct pid *pid, unsigned int flags)
{
int fd;
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
flags | O_RDWR | O_CLOEXEC);
if (fd < 0)
@@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- if (pid_has_task(p, PIDTYPE_TGID))
- fd = pidfd_create(p, flags);
- else
- fd = -EINVAL;
+ fd = pidfd_create(p, flags);
put_pid(p);
return fd;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca43239a255a..a46a3723bc66 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
@@ -449,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = {
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6bfe3ead10ad..a12779650f15 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -98,20 +98,20 @@ config PM_STD_PARTITION
default ""
help
The default resume partition is the partition that the suspend-
- to-disk implementation will look for a suspended disk image.
+ to-disk implementation will look for a suspended disk image.
- The partition specified here will be different for almost every user.
+ The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
- on before suspending.
+ on before suspending.
The partition specified can be overridden by specifying:
- resume=/dev/<other device>
+ resume=/dev/<other device>
- which will set the resume partition to the device specified.
+ which will set the resume partition to the device specified.
Note there is currently not a way to specify which device to save the
- suspended image to. It will simply pick the first available swap
+ suspended image to. It will simply pick the first available swap
device.
config PM_SLEEP
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0f4530b3a8cd..a332ccd829e2 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
- table[i].cost = div64_u64(fmax * table[i].power,
+ unsigned long power_res = em_scale_power(table[i].power);
+
+ table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index da0b41914177..559acef3fddb 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,6 +31,7 @@
#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
+#include <linux/secretmem.h>
#include <trace/events/power.h>
#include "power.h"
@@ -81,7 +82,9 @@ void hibernate_release(void)
bool hibernation_available(void)
{
- return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
+ return nohibernate == 0 &&
+ !security_locked_down(LOCKDOWN_HIBERNATION) &&
+ !secretmem_active();
}
/**
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 12c7e1bb442f..44169f3081fd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {}
struct kobject *power_kobj;
-/**
+/*
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 50cc63534486..37401c99b7d7 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * drivers/power/process.c - Functions for starting/stopping processes on
+ * drivers/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
*
* Originally from swsusp.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1a221dcb3c01..f7a986078213 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -331,7 +331,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
*
* Memory bitmap is a structure consisting of many linked lists of
* objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
* object there is a list of objects of type struct bm_block that
* represent each blocks of bitmap in which information is stored.
*
@@ -1146,7 +1146,7 @@ int create_basic_memory_bitmaps(void)
Free_second_object:
kfree(bm2);
Free_first_bitmap:
- memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
Free_first_object:
kfree(bm1);
return -ENOMEM;
@@ -1500,7 +1500,7 @@ static struct memory_bitmap copy_bm;
/**
* swsusp_free - Free pages allocated for hibernation image.
*
- * Image pages are alocated before snapshot creation, so they need to be
+ * Image pages are allocated before snapshot creation, so they need to be
* released after resume.
*/
void swsusp_free(void)
@@ -2326,7 +2326,7 @@ static struct memory_bitmap *safe_highmem_bm;
* (@nr_highmem_p points to the variable containing the number of highmem image
* pages). The pages that are "safe" (ie. will not be overwritten when the
* hibernation image is restored entirely) have the corresponding bits set in
- * @bm (it must be unitialized).
+ * @bm (it must be uninitialized).
*
* NOTE: This function should not be called if there are no highmem image pages.
*/
@@ -2483,7 +2483,7 @@ static inline void free_highmem_data(void) {}
/**
* prepare_image - Make room for loading hibernation image.
- * @new_bm: Unitialized memory bitmap structure.
+ * @new_bm: Uninitialized memory bitmap structure.
* @bm: Memory bitmap with unsafe pages marked.
*
* Use @bm to mark the pages that will be overwritten in the process of
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d8cae434f9eb..eb75f394a059 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -96,7 +96,7 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- get_online_cpus();
+ cpus_read_lock();
cpuidle_resume();
/* Push all the CPUs into the idle loop. */
@@ -106,7 +106,7 @@ static void s2idle_enter(void)
s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
- put_online_cpus();
+ cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index e1ed58adb69e..d20526c5be15 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
- if (!candidate->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index bea3cb8afa11..3cb89baebc79 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1125,7 +1125,7 @@ struct dec_data {
};
/**
- * Deompression function that runs in its own thread.
+ * Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
{
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index eee3dc9b60a9..d118739874c0 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -3,3 +3,4 @@ obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
+obj-$(CONFIG_PRINTK_INDEX) += index.o
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
new file mode 100644
index 000000000000..d3709408debe
--- /dev/null
+++ b/kernel/printk/index.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace indexing of printk formats
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "internal.h"
+
+extern struct pi_entry *__start_printk_index[];
+extern struct pi_entry *__stop_printk_index[];
+
+/* The base dir for module formats, typically debugfs/printk/index/ */
+static struct dentry *dfs_index;
+
+static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
+{
+ struct pi_entry **entries;
+ unsigned int nr_entries;
+
+#ifdef CONFIG_MODULES
+ if (mod) {
+ entries = mod->printk_index_start;
+ nr_entries = mod->printk_index_size;
+ }
+#endif
+
+ if (!mod) {
+ /* vmlinux, comes from linker symbols */
+ entries = __start_printk_index;
+ nr_entries = __stop_printk_index - __start_printk_index;
+ }
+
+ if (pos >= nr_entries)
+ return NULL;
+
+ return entries[pos];
+}
+
+static void *pi_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ const struct module *mod = s->file->f_inode->i_private;
+ struct pi_entry *entry = pi_get_entry(mod, *pos);
+
+ (*pos)++;
+
+ return entry;
+}
+
+static void *pi_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Make show() print the header line. Do not update *pos because
+ * pi_next() still has to return the entry at index 0 later.
+ */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return pi_next(s, NULL, pos);
+}
+
+/*
+ * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only
+ * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be
+ * ignored for quoting.
+ */
+#define seq_escape_printf_format(s, src) \
+ seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\")
+
+static int pi_show(struct seq_file *s, void *v)
+{
+ const struct pi_entry *entry = v;
+ int level = LOGLEVEL_DEFAULT;
+ enum printk_info_flags flags = 0;
+ u16 prefix_len = 0;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "# <level/flags> filename:line function \"format\"\n");
+ return 0;
+ }
+
+ if (!entry->fmt)
+ return 0;
+
+ if (entry->level)
+ printk_parse_prefix(entry->level, &level, &flags);
+ else
+ prefix_len = printk_parse_prefix(entry->fmt, &level, &flags);
+
+
+ if (flags & LOG_CONT) {
+ /*
+ * LOGLEVEL_DEFAULT here means "use the same level as the
+ * message we're continuing from", not the default message
+ * loglevel, so don't display it as such.
+ */
+ if (level == LOGLEVEL_DEFAULT)
+ seq_puts(s, "<c>");
+ else
+ seq_printf(s, "<%d,c>", level);
+ } else
+ seq_printf(s, "<%d>", level);
+
+ seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func);
+ if (entry->subsys_fmt_prefix)
+ seq_escape_printf_format(s, entry->subsys_fmt_prefix);
+ seq_escape_printf_format(s, entry->fmt + prefix_len);
+ seq_puts(s, "\"\n");
+
+ return 0;
+}
+
+static void pi_stop(struct seq_file *p, void *v) { }
+
+static const struct seq_operations dfs_index_sops = {
+ .start = pi_start,
+ .next = pi_next,
+ .show = pi_show,
+ .stop = pi_stop,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dfs_index);
+
+#ifdef CONFIG_MODULES
+static const char *pi_get_module_name(struct module *mod)
+{
+ return mod ? mod->name : "vmlinux";
+}
+#else
+static const char *pi_get_module_name(struct module *mod)
+{
+ return "vmlinux";
+}
+#endif
+
+static void pi_create_file(struct module *mod)
+{
+ debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index,
+ mod, &dfs_index_fops);
+}
+
+#ifdef CONFIG_MODULES
+static void pi_remove_file(struct module *mod)
+{
+ debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index));
+}
+
+static int pi_module_notify(struct notifier_block *nb, unsigned long op,
+ void *data)
+{
+ struct module *mod = data;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ pi_create_file(mod);
+ break;
+ case MODULE_STATE_GOING:
+ pi_remove_file(mod);
+ break;
+ default: /* we don't care about other module states */
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block module_printk_fmts_nb = {
+ .notifier_call = pi_module_notify,
+};
+
+static void __init pi_setup_module_notifier(void)
+{
+ register_module_notifier(&module_printk_fmts_nb);
+}
+#else
+static inline void __init pi_setup_module_notifier(void) { }
+#endif
+
+static int __init pi_init(void)
+{
+ struct dentry *dfs_root = debugfs_create_dir("printk", NULL);
+
+ dfs_index = debugfs_create_dir("index", dfs_root);
+ pi_setup_module_notifier();
+ pi_create_file(NULL);
+
+ return 0;
+}
+
+/* debugfs comes up on core and must be initialised first */
+postcore_initcall(pi_init);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 51615c909b2f..9f3ed2fdb721 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -6,11 +6,11 @@
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
-
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
+/* Flags for a single printk record. */
+enum printk_info_flags {
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
__printf(4, 0)
int vprintk_store(int facility, int level,
@@ -19,10 +19,7 @@ int vprintk_store(int facility, int level,
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-void printk_safe_init(void);
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -37,20 +34,10 @@ bool printk_percpu_data_ready(void);
local_irq_restore(flags); \
} while (0)
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
-
void defer_console_output(void);
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags);
#else
/*
@@ -61,9 +48,5 @@ void defer_console_output(void);
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline void printk_safe_init(void) { }
static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 421c35571797..a8d0a58deebc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -350,13 +350,8 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* non-prinatable characters are escaped in the "\xff" notation.
*/
-enum log_flags {
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
-
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
-static DEFINE_RAW_SPINLOCK(syslog_lock);
+static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
@@ -732,27 +727,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- printk_safe_enter_irq();
if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- printk_safe_exit_irq();
goto out;
}
- printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- printk_safe_enter_irq();
}
if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- printk_safe_exit_irq();
goto out;
}
@@ -762,7 +752,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->info->dev_info);
atomic64_set(&user->seq, r->info->seq + 1);
- printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -797,7 +786,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -818,7 +806,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- printk_safe_exit_irq();
return ret;
}
@@ -833,7 +820,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- printk_safe_enter_irq();
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
@@ -841,7 +827,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
else
ret = EPOLLIN|EPOLLRDNORM;
}
- printk_safe_exit_irq();
return ret;
}
@@ -874,9 +859,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- printk_safe_enter_irq();
atomic64_set(&user->seq, prb_first_valid_seq(prb));
- printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -1042,9 +1025,6 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
@@ -1082,6 +1062,7 @@ void __init setup_log_buf(int early)
struct prb_desc *new_descs;
struct printk_info info;
struct printk_record r;
+ unsigned int text_size;
size_t new_descs_size;
size_t new_infos_size;
unsigned long flags;
@@ -1142,24 +1123,37 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- printk_safe_enter_irqsave(flags);
+ local_irq_save(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
free = __LOG_BUF_LEN;
- prb_for_each_record(0, &printk_rb_static, seq, &r)
- free -= add_to_rb(&printk_rb_dynamic, &r);
+ prb_for_each_record(0, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
- /*
- * This is early enough that everything is still running on the
- * boot CPU and interrupts are disabled. So no new messages will
- * appear during the transition to the dynamic buffer.
- */
prb = &printk_rb_dynamic;
- printk_safe_exit_irqrestore(flags);
+ local_irq_restore(flags);
+
+ /*
+ * Copy any remaining messages that might have appeared from
+ * NMI context after copying but before switching to the
+ * dynamic buffer.
+ */
+ prb_for_each_record(seq, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
@@ -1172,9 +1166,9 @@ void __init setup_log_buf(int early)
return;
err_free_descs:
- memblock_free(__pa(new_descs), new_descs_size);
+ memblock_free_ptr(new_descs, new_descs_size);
err_free_log_buf:
- memblock_free(__pa(new_log_buf), new_log_buf_len);
+ memblock_free_ptr(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -1481,12 +1475,14 @@ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
return seq;
}
+/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
struct printk_record r;
char *text;
int len = 0;
+ u64 seq;
text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
@@ -1494,17 +1490,35 @@ static int syslog_print(char __user *buf, int size)
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
- while (size > 0) {
+ mutex_lock(&syslog_lock);
+
+ /*
+ * Wait for the @syslog_seq record to be available. @syslog_seq may
+ * change while waiting.
+ */
+ do {
+ seq = syslog_seq;
+
+ mutex_unlock(&syslog_lock);
+ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL));
+ mutex_lock(&syslog_lock);
+
+ if (len)
+ goto out;
+ } while (syslog_seq != seq);
+
+ /*
+ * Copy records that fit into the buffer. The above cycle makes sure
+ * that the first record is always available.
+ */
+ do {
size_t n;
size_t skip;
+ int err;
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
- if (!prb_read_valid(prb, syslog_seq, &r)) {
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ if (!prb_read_valid(prb, syslog_seq, &r))
break;
- }
+
if (r.info->seq != syslog_seq) {
/* message is gone, move to next valid one */
syslog_seq = r.info->seq;
@@ -1531,13 +1545,15 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
if (!n)
break;
- if (copy_to_user(buf, text + skip, n)) {
+ mutex_unlock(&syslog_lock);
+ err = copy_to_user(buf, text + skip, n);
+ mutex_lock(&syslog_lock);
+
+ if (err) {
if (!len)
len = -EFAULT;
break;
@@ -1546,8 +1562,9 @@ static int syslog_print(char __user *buf, int size)
len += n;
size -= n;
buf += n;
- }
-
+ } while (size);
+out:
+ mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
@@ -1566,7 +1583,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return -ENOMEM;
time = printk_time;
- printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1587,23 +1603,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- printk_safe_enter_irq();
if (len < 0)
break;
}
if (clear) {
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, seq);
- raw_spin_unlock(&syslog_lock);
+ mutex_unlock(&syslog_lock);
}
- printk_safe_exit_irq();
kfree(text);
return len;
@@ -1611,23 +1624,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
-}
-
-/* Return a consistent copy of @syslog_seq. */
-static u64 read_syslog_seq_irq(void)
-{
- u64 seq;
-
- raw_spin_lock_irq(&syslog_lock);
- seq = syslog_seq;
- raw_spin_unlock_irq(&syslog_lock);
-
- return seq;
+ mutex_unlock(&syslog_lock);
}
int do_syslog(int type, char __user *buf, int len, int source)
@@ -1653,11 +1652,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
-
- error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, read_syslog_seq_irq(), NULL));
- if (error)
- return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
@@ -1703,12 +1697,10 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ mutex_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ mutex_unlock(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
@@ -1736,8 +1728,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1940,6 +1931,76 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
}
}
+/*
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until per-CPU
+ * is available, a separate "early tracking" is performed.
+ */
+static DEFINE_PER_CPU(u8, printk_count);
+static u8 printk_count_early;
+#ifdef CONFIG_HAVE_NMI
+static DEFINE_PER_CPU(u8, printk_count_nmi);
+static u8 printk_count_nmi_early;
+#endif
+
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
+
+/*
+ * Return a pointer to the dedicated counter for the CPU+context of the
+ * caller.
+ */
+static u8 *__printk_recursion_counter(void)
+{
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi()) {
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count_nmi);
+ return &printk_count_nmi_early;
+ }
+#endif
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count);
+ return &printk_count_early;
+}
+
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the boolean return value to see if the recursion is
+ * allowed. On failure, interrupts are not disabled.
+ *
+ * @recursion_ptr must be a variable of type (u8 *) and is the same variable
+ * that is passed to printk_exit_irqrestore().
+ */
+#define printk_enter_irqsave(recursion_ptr, flags) \
+({ \
+ bool success = true; \
+ \
+ typecheck(u8 *, recursion_ptr); \
+ local_irq_save(flags); \
+ (recursion_ptr) = __printk_recursion_counter(); \
+ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
+ local_irq_restore(flags); \
+ success = false; \
+ } else { \
+ (*(recursion_ptr))++; \
+ } \
+ success; \
+})
+
+/* Exit recursion tracking, restoring interrupts. */
+#define printk_exit_irqrestore(recursion_ptr, flags) \
+ do { \
+ typecheck(u8 *, recursion_ptr); \
+ (*(recursion_ptr))--; \
+ local_irq_restore(flags); \
+ } while (0)
+
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1961,23 +2022,24 @@ static inline u32 printk_caller_id(void)
}
/**
- * parse_prefix - Parse level and control flags.
+ * printk_parse_prefix - Parse level and control flags.
*
* @text: The terminated text message.
* @level: A pointer to the current level value, will be updated.
- * @lflags: A pointer to the current log flags, will be updated.
+ * @flags: A pointer to the current printk_info flags, will be updated.
*
* @level may be NULL if the caller is not interested in the parsed value.
* Otherwise the variable pointed to by @level must be set to
* LOGLEVEL_DEFAULT in order to be updated with the parsed value.
*
- * @lflags may be NULL if the caller is not interested in the parsed value.
- * Otherwise the variable pointed to by @lflags will be OR'd with the parsed
+ * @flags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @flags will be OR'd with the parsed
* value.
*
* Return: The length of the parsed level and control flags.
*/
-static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags)
{
u16 prefix_len = 0;
int kern_level;
@@ -1993,8 +2055,8 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
*level = kern_level - '0';
break;
case 'c': /* KERN_CONT */
- if (lflags)
- *lflags |= LOG_CONT;
+ if (flags)
+ *flags |= LOG_CONT;
}
prefix_len += 2;
@@ -2004,8 +2066,9 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
return prefix_len;
}
-static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
- const char *fmt, va_list args)
+static u16 printk_sprint(char *text, u16 size, int facility,
+ enum printk_info_flags *flags, const char *fmt,
+ va_list args)
{
u16 text_len;
@@ -2014,14 +2077,14 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl
/* Mark and strip a trailing newline. */
if (text_len && text[text_len - 1] == '\n') {
text_len--;
- *lflags |= LOG_NEWLINE;
+ *flags |= LOG_NEWLINE;
}
/* Strip log level and control flags. */
if (facility == 0) {
u16 prefix_len;
- prefix_len = parse_prefix(text, NULL, NULL);
+ prefix_len = printk_parse_prefix(text, NULL, NULL);
if (prefix_len) {
text_len -= prefix_len;
memmove(text, text + prefix_len, text_len);
@@ -2038,13 +2101,16 @@ int vprintk_store(int facility, int level,
{
const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
- enum log_flags lflags = 0;
+ enum printk_info_flags flags = 0;
struct printk_record r;
+ unsigned long irqflags;
u16 trunc_msg_len = 0;
char prefix_buf[8];
+ u8 *recursion_ptr;
u16 reserve_size;
va_list args2;
u16 text_len;
+ int ret = 0;
u64 ts_nsec;
/*
@@ -2055,6 +2121,9 @@ int vprintk_store(int facility, int level,
*/
ts_nsec = local_clock();
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
+
/*
* The sprintf needs to come first since the syslog prefix might be
* passed in as a parameter. An extra byte must be reserved so that
@@ -2070,29 +2139,30 @@ int vprintk_store(int facility, int level,
/* Extract log level or control flags. */
if (facility == 0)
- parse_prefix(&prefix_buf[0], &level, &lflags);
+ printk_parse_prefix(&prefix_buf[0], &level, &flags);
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dev_info)
- lflags |= LOG_NEWLINE;
+ flags |= LOG_NEWLINE;
- if (lflags & LOG_CONT) {
+ if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
- facility, &lflags, fmt, args);
+ facility, &flags, fmt, args);
r.info->text_len += text_len;
- if (lflags & LOG_NEWLINE) {
+ if (flags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
} else {
prb_commit(&e);
}
- return text_len;
+ ret = text_len;
+ goto out;
}
}
@@ -2108,29 +2178,32 @@ int vprintk_store(int facility, int level,
prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
- return 0;
+ goto out;
}
/* fill message */
- text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
if (trunc_msg_len)
memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
r.info->text_len = text_len + trunc_msg_len;
r.info->facility = facility;
r.info->level = level & 7;
- r.info->flags = lflags & 0x1f;
+ r.info->flags = flags & 0x1f;
r.info->ts_nsec = ts_nsec;
r.info->caller_id = caller_id;
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* A message without a trailing newline can be continued. */
- if (!(lflags & LOG_NEWLINE))
+ if (!(flags & LOG_NEWLINE))
prb_commit(&e);
else
prb_final_commit(&e);
- return (text_len + trunc_msg_len);
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(recursion_ptr, irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -2139,7 +2212,6 @@ asmlinkage int vprintk_emit(int facility, int level,
{
int printed_len;
bool in_sched = false;
- unsigned long flags;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -2153,9 +2225,7 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- printk_safe_enter_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- printk_safe_exit_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
@@ -2186,28 +2256,7 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the
- * output and call the console drivers. If we fail to get the semaphore, we
- * place the output into the log buffer and return. The current holder of
- * the console_sem will notice the new output in console_unlock(); and will
- * send it to the consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-asmlinkage __visible int printk(const char *fmt, ...)
+asmlinkage __visible int _printk(const char *fmt, ...)
{
va_list args;
int r;
@@ -2218,7 +2267,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
return r;
}
-EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(_printk);
#else /* CONFIG_PRINTK */
@@ -2404,6 +2453,18 @@ module_param_named(console_suspend, console_suspend_enabled,
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
+static bool printk_console_no_auto_verbose;
+
+void console_verbose(void)
+{
+ if (console_loglevel && !printk_console_no_auto_verbose)
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+}
+EXPORT_SYMBOL_GPL(console_verbose);
+
+module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
+MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
+
/**
* suspend_console - suspend the console subsystem
*
@@ -2545,6 +2606,7 @@ void console_unlock(void)
bool do_cond_resched, retry;
struct printk_info info;
struct printk_record r;
+ u64 __maybe_unused next_seq;
if (console_suspended) {
up_console_sem();
@@ -2584,9 +2646,9 @@ again:
for (;;) {
size_t ext_len = 0;
+ int handover;
size_t len;
- printk_safe_enter_irqsave(flags);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2636,26 +2698,31 @@ skip:
* were to occur on another CPU, it may wait for this one to
* finish. This task can not be preempted if there is a
* waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
*/
+ printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
-
+ handover = console_lock_spinning_disable_and_check();
printk_safe_exit_irqrestore(flags);
+ if (handover)
+ return;
if (do_cond_resched)
cond_resched();
}
- console_locked = 0;
+ /* Get consistent value of the next-to-be-used sequence number. */
+ next_seq = console_seq;
+ console_locked = 0;
up_console_sem();
/*
@@ -2664,9 +2731,7 @@ skip:
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- retry = prb_read_valid(prb, console_seq, NULL);
- printk_safe_exit_irqrestore(flags);
-
+ retry = prb_read_valid(prb, next_seq, NULL);
if (retry && console_trylock())
goto again;
}
@@ -2728,13 +2793,8 @@ void console_flush_on_panic(enum con_flush_mode mode)
console_trylock();
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
+ if (mode == CONSOLE_REPLAY_ALL)
console_seq = prb_first_valid_seq(prb);
- printk_safe_exit_irqrestore(flags);
- }
console_unlock();
}
@@ -2869,7 +2929,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- unsigned long flags;
struct console *bcon = NULL;
int err;
@@ -2974,9 +3033,9 @@ void register_console(struct console *newcon)
exclusive_console_stop_seq = console_seq;
/* Get a consistent copy of @syslog_seq. */
- raw_spin_lock_irqsave(&syslog_lock, flags);
+ mutex_lock(&syslog_lock);
console_seq = syslog_seq;
- raw_spin_unlock_irqrestore(&syslog_lock, flags);
+ mutex_unlock(&syslog_lock);
}
console_unlock();
console_sysfs_notify();
@@ -3203,7 +3262,7 @@ int vprintk_deferred(const char *fmt, va_list args)
return r;
}
-int printk_deferred(const char *fmt, ...)
+int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
@@ -3386,14 +3445,12 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
struct printk_info info;
unsigned int line_count;
struct printk_record r;
- unsigned long flags;
size_t l = 0;
bool ret = false;
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
@@ -3414,7 +3471,6 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
- printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
@@ -3446,7 +3502,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
struct printk_record r;
- unsigned long flags;
u64 seq;
u64 next_seq;
size_t len = 0;
@@ -3459,7 +3514,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
@@ -3468,10 +3522,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
}
/* last entry */
- if (iter->cur_seq >= iter->next_seq) {
- printk_safe_exit_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
/*
* Find first record that fits, including all following records,
@@ -3503,7 +3555,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
iter->next_seq = next_seq;
ret = true;
- printk_safe_exit_irqrestore(flags);
out:
if (len_out)
*len_out = len;
@@ -3521,13 +3572,125 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
iter->cur_seq = latched_seq_read_nolock(&clear_seq);
iter->next_seq = prb_next_seq(prb);
- printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
+
+#ifdef CONFIG_SMP
+static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
+
+/**
+ * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
+ *
+ * Context: Any context.
+ */
+void __printk_wait_on_cpu_lock(void)
+{
+ do {
+ cpu_relax();
+ } while (atomic_read(&printk_cpulock_owner) != -1);
+}
+EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
+
+/**
+ * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ * Return: 1 on success, otherwise 0.
+ */
+int __printk_cpu_trylock(void)
+{
+ int cpu;
+ int old;
+
+ cpu = smp_processor_id();
+
+ /*
+ * Guarantee loads and stores from this CPU when it is the lock owner
+ * are _not_ visible to the previous lock owner. This pairs with
+ * __printk_cpu_unlock:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then
+ * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * of the previous CPU
+ * matching
+ * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
+ * of this CPU
+ */
+ old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1,
+ cpu); /* LMM(__printk_cpu_trylock:A) */
+ if (old == -1) {
+ /*
+ * This CPU is now the owner and begins loading/storing
+ * data: LMM(__printk_cpu_trylock:B)
+ */
+ return 1;
+
+ } else if (old == cpu) {
+ /* This CPU is already the owner. */
+ atomic_inc(&printk_cpulock_nested);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__printk_cpu_trylock);
+
+/**
+ * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
+ *
+ * The calling processor must be the owner of the lock.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ */
+void __printk_cpu_unlock(void)
+{
+ if (atomic_read(&printk_cpulock_nested)) {
+ atomic_dec(&printk_cpulock_nested);
+ return;
+ }
+
+ /*
+ * This CPU is finished loading/storing data:
+ * LMM(__printk_cpu_unlock:A)
+ */
+
+ /*
+ * Guarantee loads and stores from this CPU when it was the
+ * lock owner are visible to the next lock owner. This pairs
+ * with __printk_cpu_trylock:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B,
+ * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * of this CPU
+ * matching
+ * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
+ * of the next CPU
+ */
+ atomic_set_release(&printk_cpulock_owner,
+ -1); /* LMM(__printk_cpu_unlock:B) */
+}
+EXPORT_SYMBOL(__printk_cpu_unlock);
+#endif /* CONFIG_SMP */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 94232186fccb..ef0f9a2044da 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -4,347 +4,16 @@
*/
#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
-#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/kprobes.h>
#include "internal.h"
-/*
- * In NMI and safe mode, printk() avoids taking locks. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examining current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
static DEFINE_PER_CPU(int, printk_context);
-static DEFINE_RAW_SPINLOCK(safe_read_lock);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&safe_read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&safe_read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the safe buffers.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&safe_read_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&safe_read_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will store the messages into the main logbuf directly.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
@@ -369,46 +38,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
- unsigned long flags;
+ if (this_cpu_read(printk_context) || in_nmi()) {
int len;
- printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
/* No obstacles. */
return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
diff --git a/kernel/profile.c b/kernel/profile.c
index c2ebddb5e974..eb9c7f0f5ac5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -41,7 +41,8 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
@@ -67,8 +68,8 @@ int profile_setup(char *str)
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel sleep profiling enabled (shift: %u)\n",
prof_shift);
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
@@ -78,21 +79,21 @@ int profile_setup(char *str)
if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel schedule profiling enabled (shift: %u)\n",
prof_shift);
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
if (str[strlen(kvmstr)] == ',')
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel KVM profiling enabled (shift: %u)\n",
prof_shift);
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
return 1;
@@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2997ca600d18..f8589bf8d7dc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
- task->state = __TASK_TRACED;
+ WRITE_ONCE(task->__state, __TASK_TRACED);
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -207,7 +207,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (task->state != __TASK_TRACED)
+ if (READ_ONCE(task->__state) != __TASK_TRACED)
return;
WARN_ON(!task->ptrace || task->parent != current);
@@ -217,11 +217,11 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
* Recheck state under the lock to close this race.
*/
spin_lock_irq(&task->sighand->siglock);
- if (task->state == __TASK_TRACED) {
+ if (READ_ONCE(task->__state) == __TASK_TRACED) {
if (__fatal_signal_pending(task))
wake_up_state(task, __TASK_TRACED);
else
- task->state = TASK_TRACED;
+ WRITE_ONCE(task->__state, TASK_TRACED);
}
spin_unlock_irq(&task->sighand->siglock);
}
@@ -256,7 +256,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(child->state == __TASK_TRACED);
+ WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -273,7 +273,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* ptrace_stop() changes ->state back to TASK_RUNNING,
* so we should not worry about leaking __TASK_TRACED.
*/
- WARN_ON(child->state == __TASK_TRACED);
+ WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
ret = -ESRCH;
}
}
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 1942c1f1bb65..4fd64999300f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -116,7 +116,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
default n
select PREEMPT_COUNT if PREEMPT=n
help
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bf0827d4b659..24b5f2c2de87 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+extern void rcu_init_geometry(void);
+
/* Returns a pointer to the first leaf rcu_node structure. */
#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
@@ -422,12 +424,6 @@ do { \
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
-
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
@@ -441,7 +437,11 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
void rcupdate_announce_bootup_oddness(void);
+#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
@@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
@@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index dca51fe9c73f..2cc34a22a506 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -487,7 +487,7 @@ retry:
if (gp_async) {
cur_ops->gp_barrier();
}
- writer_n_durations[me] = i_max;
+ writer_n_durations[me] = i_max + 1;
torture_kthread_stopping("rcu_scale_writer");
return 0;
}
@@ -561,7 +561,7 @@ rcu_scale_cleanup(void)
wdpp = writer_durations[i];
if (!wdpp)
continue;
- for (j = 0; j <= writer_n_durations[i]; j++) {
+ for (j = 0; j < writer_n_durations[i]; j++) {
wdp = &wdpp[j];
pr_alert("%s%s %4d writer-duration: %5d %llu\n",
scale_type, SCALE_FLAG,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 29d2f4c647d3..ab4215266ebe 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -245,12 +245,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
-# define rcu_can_boost() 1
-#else
-# define rcu_can_boost() 0
-#endif
-
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
@@ -331,6 +325,7 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
+ int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -345,6 +340,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
void (*stats)(void);
void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
int irq_capable;
int can_boost;
@@ -359,6 +355,11 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
+static int torture_readlock_not_held(void)
+{
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
@@ -483,30 +484,32 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_gp_state = get_state_synchronize_rcu,
- .start_gp_poll = start_poll_synchronize_rcu,
- .poll_gp_state = poll_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .gp_kthread_dbg = show_rcu_gp_kthreads,
- .stall_dur = rcu_jiffies_till_stall_check,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .extendables = RCUTORTURE_MAX_EXTEND,
- .name = "rcu"
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_gp_state = get_state_synchronize_rcu,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "rcu"
};
/*
@@ -540,6 +543,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -589,6 +593,11 @@ static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
srcu_read_unlock(srcu_ctlp, idx);
}
+static int torture_srcu_read_lock_held(void)
+{
+ return srcu_read_lock_held(srcu_ctlp);
+}
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
@@ -646,6 +655,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -681,6 +691,7 @@ static struct rcu_torture_ops srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -700,6 +711,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -787,6 +799,7 @@ static struct rcu_torture_ops trivial_ops = {
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
@@ -850,6 +863,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
@@ -871,32 +885,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
return cur_ops->gp_diff(new, old);
}
-static bool __maybe_unused torturing_tasks(void)
-{
- return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
-}
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
*/
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- /* Ensure RCU-core accesses precede clearing ->inflight */
- smp_store_release(&rbip->inflight, 0);
-}
-
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
@@ -923,49 +918,68 @@ static void rcu_torture_enable_rt_throttle(void)
old_rt_runtime = -1;
}
-static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
+ int cpu;
static int dbg_done;
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
- if (end - start > test_boost_duration * HZ - HZ / 2) {
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ return false; // passed on a technicality
+ }
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
- if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
+
+ }
- return true; /* failed */
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
}
- return false; /* passed */
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
bool failed = false; // Test failed already in this test interval
- bool firsttime = true;
+ bool gp_initiated = false;
- /* Increment n_rcu_torture_boosts once per boost-test */
- while (!kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
if (kthread_should_stop())
goto checkwait;
@@ -979,33 +993,33 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
while (time_before(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!smp_load_acquire(&rbi.inflight)) {
- /* RCU core before ->inflight = 1. */
- smp_store_release(&rbi.inflight, 1);
- cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
- /* Check if the boost test failed */
- if (!firsttime && !failed)
- failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
- call_rcu_time = jiffies;
- firsttime = false;
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
}
- if (stutter_wait("rcu_torture_boost"))
+ if (stutter_wait("rcu_torture_boost")) {
sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
+ }
if (torture_must_stop())
goto checkwait;
}
- /*
- * If boost never happened, then inflight will always be 1, in
- * this case the boost check would never happen in the above
- * loop so do another one here.
- */
- if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
- rcu_torture_boost_failed(call_rcu_time, jiffies);
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
@@ -1014,11 +1028,12 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == boost_starttime && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
+ if (oldstarttime == boost_starttime) {
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
@@ -1030,15 +1045,11 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
- while (smp_load_acquire(&rbi.inflight))
- schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
-
/* Clean up and exit. */
- while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -1553,11 +1564,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp) ||
- rcu_read_lock_trace_held() ||
- torturing_tasks());
+ !cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@ -1831,10 +1838,10 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state, gp_seq, flags,
- wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? ~0U : wtp->__state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
if (!splatted && wtp) {
sched_show_task(wtp);
@@ -1861,46 +1868,47 @@ rcu_torture_stats(void *arg)
torture_shutdown_absorb("rcu_torture_stats");
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_stats");
+ return 0;
+}
- {
- struct rcu_head *rhp;
- struct kmem_cache *kcp;
- static int z;
-
- kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
- rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
- pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
- pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
- mem_dump_obj(ZERO_SIZE_PTR);
- pr_alert("mem_dump_obj(NULL):");
- mem_dump_obj(NULL);
- pr_alert("mem_dump_obj(%px):", &rhp);
- mem_dump_obj(&rhp);
- pr_alert("mem_dump_obj(%px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(%px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- pr_alert("mem_dump_obj(%px):", &z);
- mem_dump_obj(&z);
- kmem_cache_free(kcp, rhp);
- kmem_cache_destroy(kcp);
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- kfree(rhp);
- rhp = vmalloc(4096);
- pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- vfree(rhp);
- }
+/* Test mem_dump_obj() and friends. */
+static void rcu_torture_mem_dump_obj(void)
+{
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
- return 0;
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
}
static void
@@ -2014,8 +2022,13 @@ static int rcu_torture_stall(void *args)
__func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
@@ -2634,7 +2647,7 @@ static bool rcu_torture_can_boost(void)
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
- if (!cur_ops->call)
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
return false;
prio = rcu_get_gp_kthreads_prio();
@@ -2642,7 +2655,7 @@ static bool rcu_torture_can_boost(void)
return false;
if (prio < 2) {
- if (boost_warn_once == 1)
+ if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@ -2818,6 +2831,8 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
+ rcu_torture_mem_dump_obj();
+
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
@@ -3120,6 +3135,21 @@ rcu_torture_init(void)
if (firsterr < 0)
goto unwind;
rcutor_hp = firsterr;
+
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ for_each_online_cpu(cpu) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ }
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 02dd9767b559..66dc14cf5687 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -362,6 +362,145 @@ static struct ref_scale_ops rwsem_ops = {
.name = "rwsem"
};
+// Definitions for global spinlock
+static DEFINE_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock(&test_lock);
+ spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock(&test_lock);
+ un_delay(udl, ndl);
+ spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_ops = {
+ .readsection = ref_lock_section,
+ .delaysection = ref_lock_delay_section,
+ .name = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock_irqsave(&test_lock, flags);
+ spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock_irqsave(&test_lock, flags);
+ un_delay(udl, ndl);
+ spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_irq_ops = {
+ .readsection = ref_lock_irq_section,
+ .delaysection = ref_lock_irq_delay_section,
+ .name = "lock-irq"
+};
+
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_acqrel_section(const int nloops)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ un_delay(udl, ndl);
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops acqrel_ops = {
+ .readsection = ref_acqrel_section,
+ .delaysection = ref_acqrel_delay_section,
+ .name = "acqrel"
+};
+
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -382,13 +521,13 @@ ref_scale_reader(void *arg)
s64 duration;
VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
set_user_nice(current, MAX_NICE);
atomic_inc(&n_init);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
repeat:
- VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
// Wait for signal that this reader can start.
wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -398,7 +537,7 @@ repeat:
goto end;
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(smp_processor_id() != me);
+ WARN_ON_ONCE(raw_smp_processor_id() != me);
WRITE_ONCE(rt->start_reader, 0);
if (!atomic_dec_return(&n_started))
@@ -653,8 +792,8 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
- &refcnt_ops, &rwlock_ops, &rwsem_ops,
+ &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 26344dc6483b..a0ba2ed49bc6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e26547b34ad3..6833d8887181 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -80,7 +80,7 @@ do { \
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
+static void init_srcu_struct_nodes(struct srcu_struct *ssp)
{
int cpu;
int i;
@@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
struct srcu_node *snp;
struct srcu_node *snp_first;
+ /* Initialize geometry if it has not already been initialized. */
+ rcu_init_geometry();
+
/* Work out the overall tree geometry. */
ssp->level[0] = &ssp->node[0];
for (i = 1; i < rcu_num_lvls; i++)
@@ -148,14 +151,6 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
- if (is_static)
- continue;
-
- /* Dynamically allocated, better be no srcu_read_locks()! */
- for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
- sdp->srcu_lock_count[i] = 0;
- sdp->srcu_unlock_count[i] = 0;
- }
}
}
@@ -179,7 +174,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
return -ENOMEM;
- init_srcu_struct_nodes(ssp, is_static);
+ init_srcu_struct_nodes(ssp);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -777,9 +772,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
- * No local callbacks, so probabalistically probe global state.
+ * No local callbacks, so probabilistically probe global state.
* Exact information would require acquiring locks, which would
- * kill scalability, hence the probabalistic nature of the probe.
+ * kill scalability, hence the probabilistic nature of the probe.
*/
/* First, see if enough time has passed since the last GP. */
@@ -1000,6 +995,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
* passed the same srcu_struct structure.
*
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
* If SRCU is likely idle, expedite the first request. This semantic
* was provided by Classic SRCU, and is relied upon by its users, so TREE
* SRCU must also provide it. Note that detecting idleness is heuristic
@@ -1392,11 +1390,15 @@ void __init srcu_init(void)
{
struct srcu_struct *ssp;
+ /*
+ * Once that is set, call_srcu() can follow the normal path and
+ * queue delayed work. This must follow RCU workqueues creation
+ * and timers initialization.
+ */
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
- check_init_srcu_struct(ssp);
list_del_init(&ssp->work.work.entry);
queue_work(rcu_gp_wq, &ssp->work.work);
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index d4558ab7a07d..33d896d85902 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp)
rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * We're at least a GP after the last rcu_sync_exit(); everybody
* will now have observed the write side critical section.
- * Let 'em rip!.
+ * Let 'em rip!
*/
WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 350ebf5051f9..806160c44b17 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -23,7 +23,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
* @cbs_head: Head of callback list.
* @cbs_tail: Tail pointer for callback list.
- * @cbs_wq: Wait queue allowning new callback to get kthread's attention.
+ * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
* @cbs_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
@@ -377,6 +377,46 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+// Invokes synchronize_rcu() in order to wait for all in-flight
+// t->on_rq and t->nvcsw transitions to complete. This works because
+// all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+// For every runnable non-idle task other than the current one, use
+// get_task_struct() to pin down that task, snapshot that task's
+// number of voluntary context switches, and add that task to the
+// holdout list.
+// rcu_tasks_postscan():
+// Invoke synchronize_srcu() to ensure that all tasks that were
+// in the process of exiting (and which thus might not know to
+// synchronize with this RCU Tasks grace period) have completed
+// exiting.
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+// Invokes synchronize_rcu() in order to ensure that all prior
+// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+// to have happened before the end of this RCU Tasks grace period.
+// Again, this works because all such transitions are carried out
+// with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+// read-side critical sections waited for by rcu_tasks_postscan().
+//
+// Pre-grace-period update-side code is ordered before the grace via the
+// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
+// code is ordered before the grace period via synchronize_rcu() call
+// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
/* Pre-grace-period preparation. */
static void rcu_tasks_pregp_step(void)
@@ -504,7 +544,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
- * through a safe state, not so much for data-strcuture synchronization.
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -603,10 +643,15 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching
-// of concurrent calls to the synchronous synchronize_rcu_rude() API.
-// This sends IPIs far and wide and induces otherwise unnecessary context
-// switches on all online CPUs, whether idle or not.
+// provides an asynchronous call_rcu_tasks_rude() API and batching of
+// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
+// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
+// and induces otherwise unnecessary context switches on all online CPUs,
+// whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
@@ -637,7 +682,7 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -740,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// set that task's .need_qs flag so that task's next outermost
// rcu_read_unlock_trace() will report the quiescent state (in which
// case the count of readers is incremented). If both attempts fail,
-// the task is added to a "holdout" list.
+// the task is added to a "holdout" list. Note that IPIs are used
+// to invoke trc_read_check_handler() in the context of running tasks
+// in order to avoid ordering overhead on common-case shared-variable
+// accessses.
// rcu_tasks_trace_postscan():
// Initialize state and attempt to identify an immediate quiescent
// state as above (but only for idle tasks), unblock CPU-hotplug
@@ -802,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
/* If we are the last reader, wake up the grace-period kthread. */
void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
{
- int nq = t->trc_reader_special.b.need_qs;
+ int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
t->trc_reader_special.b.need_mb)
@@ -849,7 +897,7 @@ static void trc_read_check_handler(void *t_in)
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
+ if (likely(!READ_ONCE(t->trc_reader_nesting))) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
// Mark as checked after decrement to avoid false
@@ -858,7 +906,7 @@ static void trc_read_check_handler(void *t_in)
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(t->trc_reader_nesting < 0)) {
+ if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
goto reset_ipi;
@@ -868,14 +916,14 @@ static void trc_read_check_handler(void *t_in)
// Get here if the task is in a read-side critical section. Set
// its state so that it will awaken the grace-period kthread upon
// exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
// Also order this IPI handler against any later manipulations of
// the intended task.
- smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
}
@@ -905,13 +953,13 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
n_heavy_reader_ofl_updates++;
in_qs = true;
} else {
+ // The task is not running, so C-language access is safe.
in_qs = likely(!t->trc_reader_nesting);
}
- // Mark as checked. Because this is called from the grace-period
- // kthread, also remove the task from the holdout list.
+ // Mark as checked so that the grace-period kthread will
+ // remove it from the holdout list.
t->trc_reader_checked = true;
- trc_del_holdout(t);
if (in_qs)
return true; // Already in quiescent state, done!!!
@@ -920,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
// state so that it will awaken the grace-period kthread upon exit
// from that critical section.
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
return true;
}
@@ -938,8 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
t->trc_reader_checked = true;
- trc_del_holdout(t);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
@@ -951,6 +998,12 @@ static void trc_wait_for_one_reader(struct task_struct *t,
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // rcu_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
if (task_curr(t) &&
@@ -1049,8 +1102,8 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
".i"[is_idle_task(t)],
".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
+ READ_ONCE(t->trc_reader_nesting),
+ " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
cpu);
sched_show_task(t);
}
@@ -1144,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
WRITE_ONCE(t->trc_reader_nesting, 0);
if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
rcu_read_unlock_trace_special(t, 0);
@@ -1163,7 +1216,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1356,5 +1409,4 @@ void __init rcu_init_tasks_generic(void)
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
-void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c8a029fbb114..340b3f8b090d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -221,5 +221,4 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
- srcu_init();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c16..bce848e50512 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,6 +32,8 @@
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -72,17 +74,10 @@
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
#endif
@@ -186,6 +181,17 @@ module_param(rcu_unlock_delay, int, 0444);
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -202,7 +208,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
-#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Compute the mask of online CPUs for the specified rcu_node structure.
@@ -242,6 +248,16 @@ void rcu_softirq_qs(void)
{
rcu_qs();
rcu_preempt_deferred_qs(current);
+ rcu_tasks_qs(current, false);
+}
+
+/*
+ * Increment the current CPU's rcu_data structure's ->dynticks field
+ * with ordering. Return the new value.
+ */
+static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
+{
+ return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks));
}
/*
@@ -252,7 +268,6 @@ void rcu_softirq_qs(void)
*/
static noinstr void rcu_dynticks_eqs_enter(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -261,13 +276,9 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* next idle sojourn.
*/
rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
}
/*
@@ -277,7 +288,6 @@ static noinstr void rcu_dynticks_eqs_enter(void)
*/
static noinstr void rcu_dynticks_eqs_exit(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -285,15 +295,10 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
}
/*
@@ -310,9 +315,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (atomic_read(&rdp->dynticks) & 0x1)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ rcu_dynticks_inc(1);
}
/*
@@ -322,9 +327,7 @@ static void rcu_dynticks_eqs_online(void)
*/
static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
}
/*
@@ -333,9 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
*/
static int rcu_dynticks_snap(struct rcu_data *rdp)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return atomic_read_acquire(&rdp->dynticks);
}
/*
@@ -344,7 +346,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
+ return !(snap & 0x1);
}
/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
@@ -375,8 +377,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
+ snap = atomic_read(&rdp->dynticks) & ~0x1;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
@@ -384,32 +385,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == atomic_read(&rdp->dynticks);
}
/*
@@ -425,13 +401,12 @@ bool rcu_eqs_special_set(int cpu)
*/
notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = rcu_dynticks_inc(2);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & 0x1));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -833,28 +808,6 @@ void noinstr rcu_irq_exit(void)
rcu_nmi_exit();
}
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- * towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
- "RCU in extended quiescent state!");
-}
-
#ifdef CONFIG_PROVE_RCU
/**
* rcu_irq_exit_check_preempt - Validate that scheduling is possible
@@ -959,7 +912,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void noinstr rcu_user_exit(void)
{
- rcu_eqs_exit(1);
+ rcu_eqs_exit(true);
}
/**
@@ -1225,7 +1178,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
@@ -1333,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
+ rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
@@ -1780,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused)
/*
* Initialize a new grace period. Return false if no grace period required.
*/
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
{
unsigned long firstseq;
unsigned long flags;
@@ -1974,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time)
/*
* Loop doing repeated quiescent-state forcing until the grace period ends.
*/
-static void rcu_gp_fqs_loop(void)
+static noinline_for_stack void rcu_gp_fqs_loop(void)
{
bool first_gp_fqs;
int gf = 0;
@@ -2001,8 +1954,8 @@ static void rcu_gp_fqs_loop(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
- ret = swait_event_idle_timeout_exclusive(
- rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
@@ -2048,7 +2001,7 @@ static void rcu_gp_fqs_loop(void)
/*
* Clean up after the old grace period.
*/
-static void rcu_gp_cleanup(void)
+static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
@@ -2479,9 +2432,6 @@ int rcutree_dead_cpu(unsigned int cpu)
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
@@ -2489,7 +2439,7 @@ int rcutree_dead_cpu(unsigned int cpu)
/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
@@ -2629,7 +2579,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
@@ -2911,7 +2861,6 @@ static int __init rcu_spawn_core_kthreads(void)
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
-early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -3082,12 +3031,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested. In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections. This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
@@ -3107,6 +3058,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -3171,6 +3125,7 @@ struct kfree_rcu_cpu_work {
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
@@ -3190,7 +3145,8 @@ struct kfree_rcu_cpu {
bool initialized;
int count;
- struct work_struct page_cache_work;
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
atomic_t work_in_progress;
struct hrtimer hrtimer;
@@ -3237,7 +3193,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp)
if (!krcp->nr_bkv_objs)
return NULL;
- krcp->nr_bkv_objs--;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
return (struct kvfree_rcu_bulk_data *)
llist_del_first(&krcp->bkvcache);
}
@@ -3251,14 +3207,33 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
return false;
llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- krcp->nr_bkv_objs++;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
return true;
+}
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
}
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
+ * It frees all the objects queued on ->bkvhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
@@ -3285,7 +3260,7 @@ static void kfree_rcu_work(struct work_struct *work)
krwp->head_free = NULL;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
- // Handle two first channels.
+ // Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
for (; bkvhead[i]; bkvhead[i] = bnext) {
bnext = bkvhead[i]->next;
@@ -3323,9 +3298,11 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
- * Emergency case only. It can happen under low memory
- * condition when an allocation gets failed, so the "bulk"
- * path can not be temporary maintained.
+ * This is used when the "bulk" path can not be used for the
+ * double-argument of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
@@ -3345,34 +3322,31 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
-static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+static void kfree_rcu_monitor(struct work_struct *work)
{
- struct kfree_rcu_cpu_work *krwp;
- bool repeat = false;
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+ unsigned long flags;
int i, j;
- lockdep_assert_held(&krcp->lock);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- /*
- * Try to detach bkvhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
+ // Try to detach bkvhead or head and attach it over any
+ // available corresponding free channel. It can be that
+ // a previous RCU batch is in progress, it means that
+ // immediately to queue another one is not possible so
+ // in that case the monitor work is rearmed.
if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- // Channel 1 corresponds to SLAB ptrs.
- // Channel 2 corresponds to vmalloc ptrs.
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
if (!krwp->bkvhead_free[j]) {
krwp->bkvhead_free[j] = krcp->bkvhead[j];
@@ -3380,7 +3354,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
}
}
- // Channel 3 corresponds to emergency path.
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
@@ -3388,65 +3363,35 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
- /*
- * One work is per one batch, so there are three
- * "free channels", the batch can handle. It can
- * be that the work is in the pending state when
- * channels have been detached following by each
- * other.
- */
+ // One work is per one batch, so there are three
+ // "free channels", the batch can handle. It can
+ // be that the work is in the pending state when
+ // channels have been detached following by each
+ // other.
queue_rcu_work(system_wq, &krwp->rcu_work);
}
-
- // Repeat if any "free" corresponding channel is still busy.
- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
- repeat = true;
}
- return !repeat;
-}
-
-static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
- unsigned long flags)
-{
- // Attempt to start a new batch.
- krcp->monitor_todo = false;
- if (queue_kfree_rcu_work(krcp)) {
- // Success! Our job is done here.
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- return;
- }
+ // If there is nothing to detach, it means that our job is
+ // successfully done here. In case of having at least one
+ // of the channels that is still busy we should rearm the
+ // work to repeat an attempt. Because previous batches are
+ // still in progress.
+ if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
+ krcp->monitor_todo = false;
+ else
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- // Previous RCU batch still in progress, try again later.
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
-{
- unsigned long flags;
- struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
- monitor_work.work);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
- queue_work(system_highpri_wq, &krcp->page_cache_work);
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
return HRTIMER_NORESTART;
}
@@ -3455,12 +3400,16 @@ static void fill_page_cache_func(struct work_struct *work)
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
- page_cache_work);
+ page_cache_work.work);
unsigned long flags;
+ int nr_pages;
bool pushed;
int i;
- for (i = 0; i < rcu_min_cached_objs; i++) {
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = 0; i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -3477,6 +3426,7 @@ static void fill_page_cache_func(struct work_struct *work)
}
atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
}
static void
@@ -3484,10 +3434,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(system_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
}
}
@@ -3552,11 +3507,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
}
/*
- * Queue a request for lazy invocation of appropriate free routine after a
- * grace period. Please note there are three paths are maintained, two are the
- * main ones that use array of pointers interface and third one is emergency
- * one, that is used only when the main path can not be maintained temporary,
- * due to memory pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
*
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
@@ -3645,6 +3600,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count += READ_ONCE(krcp->count);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
}
return count;
@@ -3654,18 +3611,14 @@ static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
- unsigned long flags;
for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
@@ -3693,7 +3646,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if (!krcp->head || krcp->monitor_todo) {
+ if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
+ krcp->monitor_todo) {
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
@@ -3750,10 +3704,12 @@ static int rcu_blocking_is_gp(void)
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
@@ -3774,6 +3730,9 @@ static int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void synchronize_rcu(void)
{
@@ -3844,11 +3803,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
* poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
* which oldstate was obtained, return @true, otherwise return @false.
- * If @false is returned, it is the caller's responsibilty to invoke this
+ * If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
* to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
@@ -3860,6 +3819,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
* has completed.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
@@ -3874,7 +3838,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
@@ -3884,6 +3848,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
* counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
{
@@ -3911,7 +3880,7 @@ static int rcu_pending(int user)
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
@@ -4039,7 +4008,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- get_online_cpus();
+ cpus_read_lock();
/*
* Force each CPU with callbacks to register a new callback.
@@ -4070,7 +4039,7 @@ void rcu_barrier(void)
rcu_state.barrier_sequence);
}
}
- put_online_cpus();
+ cpus_read_unlock();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -4094,7 +4063,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
@@ -4189,7 +4158,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_prepare_kthreads(cpu);
+ rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4472,6 +4441,7 @@ static int __init rcu_spawn_gp_kthread(void)
wake_up_process(t);
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
+ rcu_spawn_core_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4582,11 +4552,25 @@ static void __init rcu_init_one(void)
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
{
ulong d;
int i;
+ static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpus_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
@@ -4687,6 +4671,18 @@ static void __init kfree_rcu_batch_init(void)
int cpu;
int i;
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
@@ -4696,7 +4692,7 @@ static void __init kfree_rcu_batch_init(void)
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
@@ -4730,12 +4726,11 @@ void __init rcu_init(void)
rcutree_online_cpu(cpu);
}
- /* Create workqueue for expedited GPs and for Tree SRCU. */
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
- srcu_init();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -4747,4 +4742,5 @@ void __init rcu_init(void)
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71821d59d95c..305cf6aeb408 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -115,6 +115,7 @@ struct rcu_node {
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
+ unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -153,7 +154,7 @@ struct rcu_data {
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
- bool core_needs_qs; /* Core waits for quiesc state. */
+ bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
@@ -218,7 +219,6 @@ struct rcu_data {
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
- struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
u8 nocb_gp_gp; /* GP to wait for on last scan? */
@@ -257,10 +257,10 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
-#define RCU_NOCB_WAKE 1
-#define RCU_NOCB_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_BYPASS 1
+#define RCU_NOCB_WAKE 2
+#define RCU_NOCB_WAKE_FORCE 3
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -417,8 +417,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -434,7 +434,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
bool *was_alldone, unsigned long flags);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..8fdf44f8523f
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1496 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ return 1;
+}
+__setup("rcu_nocbs=", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+/* Is the specified CPU a no-CBs CPU? */
+bool rcu_is_nocb_cpu(int cpu)
+{
+ if (cpumask_available(rcu_nocb_mask))
+ return cpumask_test_cpu(cpu, rcu_nocb_mask);
+ return false;
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case
+ * of callback storm, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ return;
+}
+
+/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ * rdp_offload_toggle() nocb_gp_enabled_cb()
+ * ------------------------- ----------------------------
+ * WRITE flags LOCK nocb_gp_lock
+ * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
+ * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
+ * UNLOCK nocb_gp_lock READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
+ bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return false;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return true;
+}
+
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_idle(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return ret;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-offload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_one_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ }
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread, spawn it.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ if (rcu_scheduler_fully_active)
+ rcu_spawn_one_nocb_kthread(cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_cpu_nocb_kthread(cpu);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp_prev = rdp;
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad0156b86937..d070059163d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,48 +13,6 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- /* Race on early boot between thread creation and assignment */
- if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
- return true;
-
- if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
- if (in_task())
- return true;
- return false;
-}
-
-static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
-{
- return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
-}
-#else
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
-static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
-{
- return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
@@ -72,8 +30,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
rcu_lockdep_is_held_nocb(rdp) ||
(rdp == this_cpu_ptr(&rcu_data) &&
!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
- rcu_current_is_nocb_kthread(rdp) ||
- rcu_running_nocb_timer(rdp)),
+ rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -356,7 +313,7 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -415,17 +372,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -569,7 +529,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WRITE_ONCE(rnp->exp_tasks, np);
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
if (&t->rcu_node_entry == rnp->boost_tasks)
WRITE_ONCE(rnp->boost_tasks, np);
}
@@ -596,7 +556,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx);
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
/*
* If this was the last task on the expedited lists,
@@ -1093,11 +1053,12 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
@@ -1197,22 +1158,16 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - rcu_get_root();
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+ if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
return;
rcu_state.boost = 1;
- if (rnp->boost_kthread_task != NULL)
- return;
-
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
@@ -1264,17 +1219,8 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_boost_kthread(rnp);
+ if (rcu_rnp_online_cpus(rnp))
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1294,15 +1240,15 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void __init rcu_spawn_boost_kthreads(void)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static void rcu_prepare_kthreads(int cpu)
+static void __init rcu_spawn_boost_kthreads(void)
{
}
@@ -1503,1449 +1449,6 @@ static void rcu_cleanup_after_idle(void)
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * If the list is invalid, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */
- cpumask_setall(rcu_nocb_mask);
- else
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_rdp_is_offloaded(rdp))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-/*
- * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
- * and this function releases it.
- */
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- bool needwake = false;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- return false;
- }
-
- if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp->nocb_timer);
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return needwake;
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
- return;
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- if (rdp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- lockdep_assert_irqs_disabled();
-
- // Pure softirq/rcuc based processing: no bypassing, no
- // locking.
- if (!rcu_rdp_is_offloaded(rdp)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
-/*
- * Awaken the no-CBs grace-period kthead if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- return;
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- wake_nocb_gp(rdp, false, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_bypass_timer))
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- return;
-}
-
-/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
-static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
- rcu_nocb_lock_irqsave(rdp, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- __call_rcu_nocb_wake(rdp, true, flags);
-}
-
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
- }
-
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
-}
-
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
- if (bypass) { // Avoid race with first bypass CB.
- WRITE_ONCE(my_rdp->nocb_defer_wakeup,
- RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (bypass)
- del_timer(&my_rdp->nocb_bypass_timer);
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only once at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
-{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_state = false;
- bool needwake_gp = false;
- bool can_sleep = true;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- /*
- * Disable BH to provide the expected environment. Also, when
- * transitioning to/from NOCB mode, a self-requeuing callback might
- * be invoked from softirq. A short grace period could cause both
- * instances of this callback would execute concurrently.
- */
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
- } else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
-
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
-{
- unsigned long flags;
- int ndw;
- int ret;
-
- rcu_nocb_lock_irqsave(rdp, flags);
- if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return false;
- }
- ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-
- return ret;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return do_nocb_deferred_wakeup_common(rdp);
- return false;
-}
-
-void rcu_nocb_flush_deferred_wakeup(void)
-{
- do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (rdp_gp->nocb_gp_sleep) {
- rdp_gp->nocb_gp_sleep = false;
- wake_gp = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
-}
-
-static long rcu_nocb_rdp_deoffload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
-
- pr_info("De-offloading %d\n", rdp->cpu);
-
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- rcu_nocb_lock_irqsave(rdp, flags);
- /* Make sure nocb timer won't stay around */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- del_timer_sync(&rdp->nocb_timer);
-
- /*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
- * and IRQs disabled but let's be paranoid.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
- /*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
-
-
- return ret;
-}
-
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- if (rdp == rdp->nocb_gp_rdp) {
- pr_info("Can't deoffload an rdp GP leader (yet)\n");
- return -EINVAL;
- }
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
-static long rcu_nocb_rdp_offload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
- /*
- * For now we only support re-offload, ie: the rdp must have been
- * offloaded on boot first.
- */
- if (!rdp->nocb_gp_rdp)
- return -EINVAL;
-
- pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- /* Re-enable nocb timer */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- ret = rdp_offload_toggle(rdp, true, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
-
- return ret;
-}
-
-int rcu_nocb_cpu_offload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- init_swait_queue_head(&rdp->nocb_state_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-// The ->on_cpu field is available only in CONFIG_SMP=y, so...
-#ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
-}
-#else // #ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return "";
-}
-#endif // #else #ifdef CONFIG_SMP
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "bB"[timer_pending(&rdp->nocb_bypass_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
- rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- char bufw[20];
- char bufr[20];
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wastimer;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
- ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
- ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist),
- rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wastimer = timer_pending(&rdp->nocb_bypass_timer);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
- return; /* Nothing untowards. */
-
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[wastimer],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2995,17 +1498,17 @@ static void noinstr rcu_dynticks_task_exit(void)
/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static void rcu_dynticks_task_trace_enter(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static void rcu_dynticks_task_trace_exit(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 59b95cc5cbdf..677ee3d8671b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,8 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -117,17 +119,14 @@ static void panic_on_rcu_stall(void)
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
//////////////////////////////////////////////////////////////////////////////
@@ -267,8 +266,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *ts[8];
lockdep_assert_irqs_disabled();
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -280,8 +281,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
+ while (i) {
+ t = ts[--i];
if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
@@ -314,6 +315,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
@@ -349,7 +351,7 @@ static void rcu_dump_cpu_stacks(void)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
@@ -460,12 +462,13 @@ static void rcu_check_gp_kthread_starvation(void)
if (rcu_is_gp_kthread_starving(&j)) {
cpu = gpk ? task_cpu(gpk) : -1;
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, cpu);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
@@ -503,12 +506,12 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
gpk && !READ_ONCE(gpk->on_rq)) {
cpu = task_cpu(gpk);
- pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
rcu_state.name, (jiffies - jiffies_fqs),
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
- gpk->state);
+ data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
}
@@ -567,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -645,6 +648,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool didstall = false;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -690,24 +694,46 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
if (rcu_gp_in_progress() &&
(READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
+ }
+ if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
}
}
@@ -717,6 +743,63 @@ static void check_cpu_stall(struct rcu_data *rdp)
/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiscent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -726,29 +809,41 @@ void show_rcu_gp_kthreads(void)
unsigned long j;
unsigned long ja;
unsigned long jr;
+ unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->state : 0x1ffffL,
- ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
- READ_ONCE(rnp->gp_seq_needed)))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
- (long)data_race(rnp->gp_seq_needed));
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -758,12 +853,12 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- cbs += data_race(rdp->n_cbs_invoked);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
@@ -845,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index b95ae86c40a7..c21b38cc25e9 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
noinstr int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -524,6 +524,7 @@ static void test_callback(struct rcu_head *r)
}
DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
struct early_boot_kfree_rcu {
struct rcu_head rh;
@@ -536,8 +537,10 @@ static void early_boot_test_call_rcu(void)
struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU))
+ if (IS_ENABLED(CONFIG_SRCU)) {
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
+ }
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -563,6 +566,7 @@ static int rcu_verify_early_boot_tests(void)
if (IS_ENABLED(CONFIG_SRCU)) {
early_boot_test_counter++;
srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
}
if (rcu_self_test_counter != early_boot_test_counter) {
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 31bf2611ee12..efb40d095d1e 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -7,6 +7,7 @@
#define pr_fmt(fmt) "reboot: " fmt
+#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/export.h>
#include <linux/kexec.h>
@@ -519,6 +520,84 @@ void orderly_reboot(void)
}
EXPORT_SYMBOL_GPL(orderly_reboot);
+/**
+ * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void hw_failure_emergency_poweroff_func(struct work_struct *work)
+{
+ /*
+ * We have reached here after the emergency shutdown waiting period has
+ * expired. This means orderly_poweroff has not been able to shut off
+ * the system for some reason.
+ *
+ * Try to shut down the system immediately using kernel_power_off
+ * if populated
+ */
+ pr_emerg("Hardware protection timed-out. Trying forced poweroff\n");
+ kernel_power_off();
+
+ /*
+ * Worst of the worst case trigger emergency restart
+ */
+ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
+ emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
+ hw_failure_emergency_poweroff_func);
+
+/**
+ * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a given period of time. If time is negative this is not scheduled.
+ */
+static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
+{
+ if (poweroff_delay_ms <= 0)
+ return;
+ schedule_delayed_work(&hw_failure_emergency_poweroff_work,
+ msecs_to_jiffies(poweroff_delay_ms));
+}
+
+/**
+ * hw_protection_shutdown - Trigger an emergency system poweroff
+ *
+ * @reason: Reason of emergency shutdown to be printed.
+ * @ms_until_forced: Time to wait for orderly shutdown before tiggering a
+ * forced shudown. Negative value disables the forced
+ * shutdown.
+ *
+ * Initiate an emergency system shutdown in order to protect hardware from
+ * further damage. Usage examples include a thermal protection or a voltage or
+ * current regulator failures.
+ * NOTE: The request is ignored if protection shutdown is already pending even
+ * if the previous request has given a large timeout for forced shutdown.
+ * Can be called from any context.
+ */
+void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+ static atomic_t allow_proceed = ATOMIC_INIT(1);
+
+ pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
+
+ /* Shutdown should be initiated only once. */
+ if (!atomic_dec_and_test(&allow_proceed))
+ return;
+
+ /*
+ * Queue a backup emergency shutdown in the event of
+ * orderly_poweroff failure
+ */
+ hw_failure_emergency_poweroff(ms_until_forced);
+ orderly_poweroff(true);
+}
+EXPORT_SYMBOL_GPL(hw_protection_shutdown);
+
static int __init reboot_setup(char *str)
{
for (;;) {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 35f7bd0fced0..6d45ac3dae7f 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+
+ /*
+ * regs is NULL if and only if the caller is in a syscall path. Skip
+ * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
+ * kill a misbehaving userspace on debug kernels.
+ */
+ if (regs) {
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ }
if (unlikely(rseq_update_cpu_id(t)))
goto error;
return;
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 2377cbb32474..64a08288b1a6 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -64,6 +64,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
+torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations.");
torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations.");
@@ -86,6 +87,8 @@ struct scf_statistics {
long long n_resched;
long long n_single;
long long n_single_ofl;
+ long long n_single_rpc;
+ long long n_single_rpc_ofl;
long long n_single_wait;
long long n_single_wait_ofl;
long long n_many;
@@ -101,14 +104,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count);
// Data for random primitive selection
#define SCF_PRIM_RESCHED 0
#define SCF_PRIM_SINGLE 1
-#define SCF_PRIM_MANY 2
-#define SCF_PRIM_ALL 3
-#define SCF_NPRIMS 7 // Need wait and no-wait versions of each,
- // except for SCF_PRIM_RESCHED.
+#define SCF_PRIM_SINGLE_RPC 2
+#define SCF_PRIM_MANY 3
+#define SCF_PRIM_ALL 4
+#define SCF_NPRIMS 8 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED and
+ // SCF_PRIM_SINGLE_RPC.
static char *scf_prim_name[] = {
"resched_cpu",
"smp_call_function_single",
+ "smp_call_function_single_rpc",
"smp_call_function_many",
"smp_call_function",
};
@@ -128,6 +134,8 @@ struct scf_check {
bool scfc_out;
int scfc_cpu; // -1 for not _single().
bool scfc_wait;
+ bool scfc_rpc;
+ struct completion scfc_completion;
};
// Use to wait for all threads to start.
@@ -158,6 +166,7 @@ static void scf_torture_stats_print(void)
scfs.n_resched += scf_stats_p[i].n_resched;
scfs.n_single += scf_stats_p[i].n_single;
scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
+ scfs.n_single_rpc += scf_stats_p[i].n_single_rpc;
scfs.n_single_wait += scf_stats_p[i].n_single_wait;
scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl;
scfs.n_many += scf_stats_p[i].n_many;
@@ -168,9 +177,10 @@ static void scf_torture_stats_print(void)
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
bangstr = "!!! ";
- pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ",
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
+ scfs.n_single_rpc, scfs.n_single_rpc_ofl,
scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
torture_onoff_stats();
pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs),
@@ -282,10 +292,13 @@ static void scf_handler(void *scfc_in)
out:
if (unlikely(!scfcp))
return;
- if (scfcp->scfc_wait)
+ if (scfcp->scfc_wait) {
WRITE_ONCE(scfcp->scfc_out, true);
- else
+ if (scfcp->scfc_rpc)
+ complete(&scfcp->scfc_completion);
+ } else {
kfree(scfcp);
+ }
}
// As above, but check for correct CPU.
@@ -319,6 +332,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp->scfc_cpu = -1;
scfcp->scfc_wait = scfsp->scfs_wait;
scfcp->scfc_out = false;
+ scfcp->scfc_rpc = false;
}
}
switch (scfsp->scfs_prim) {
@@ -350,6 +364,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp = NULL;
}
break;
+ case SCF_PRIM_SINGLE_RPC:
+ if (!scfcp)
+ break;
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_single_rpc++;
+ scfcp->scfc_cpu = cpu;
+ scfcp->scfc_wait = true;
+ init_completion(&scfcp->scfc_completion);
+ scfcp->scfc_rpc = true;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0);
+ if (!ret) {
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ wait_for_completion(&scfcp->scfc_completion);
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ } else {
+ scfp->n_single_rpc_ofl++;
+ kfree(scfcp);
+ scfcp = NULL;
+ }
+ break;
case SCF_PRIM_MANY:
if (scfsp->scfs_wait)
scfp->n_many_wait++;
@@ -379,10 +421,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
}
if (scfcp && scfsp->scfs_wait) {
if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
- !scfcp->scfc_out))
+ !scfcp->scfc_out)) {
+ pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
atomic_inc(&n_mb_out_errs); // Leak rather than trash!
- else
+ } else {
kfree(scfcp);
+ }
barrier(); // Prevent race-reduction compiler optimizations.
}
if (use_cpus_read_lock)
@@ -405,15 +449,15 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
cpu = scfp->cpu % nr_cpu_ids;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu)));
set_user_nice(current, MAX_NICE);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
- VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id());
// Make sure that the CPU is affinitized appropriately during testing.
- curcpu = smp_processor_id();
+ curcpu = raw_smp_processor_id();
WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
"%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
__func__, scfp->cpu, curcpu, nr_cpu_ids);
@@ -453,8 +497,8 @@ static void
scftorture_print_module_parms(const char *tag)
{
pr_alert(SCFTORT_FLAG
- "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
- verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
}
static void scf_cleanup_handler(void *unused)
@@ -469,7 +513,7 @@ static void scf_torture_cleanup(void)
return;
WRITE_ONCE(scfdone, true);
- if (nthreads)
+ if (nthreads && scf_stats_p)
for (i = 0; i < nthreads; i++)
torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
else
@@ -497,6 +541,7 @@ static int __init scf_torture_init(void)
int firsterr = 0;
unsigned long weight_resched1 = weight_resched;
unsigned long weight_single1 = weight_single;
+ unsigned long weight_single_rpc1 = weight_single_rpc;
unsigned long weight_single_wait1 = weight_single_wait;
unsigned long weight_many1 = weight_many;
unsigned long weight_many_wait1 = weight_many_wait;
@@ -508,11 +553,13 @@ static int __init scf_torture_init(void)
scftorture_print_module_parms("Start of test");
- if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 &&
+ if (weight_resched == -1 &&
+ weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 &&
weight_many == -1 && weight_many_wait == -1 &&
weight_all == -1 && weight_all_wait == -1) {
weight_resched1 = 2 * nr_cpu_ids;
weight_single1 = 2 * nr_cpu_ids;
+ weight_single_rpc1 = 2 * nr_cpu_ids;
weight_single_wait1 = 2 * nr_cpu_ids;
weight_many1 = 2;
weight_many_wait1 = 2;
@@ -523,6 +570,8 @@ static int __init scf_torture_init(void)
weight_resched1 = 0;
if (weight_single == -1)
weight_single1 = 0;
+ if (weight_single_rpc == -1)
+ weight_single_rpc1 = 0;
if (weight_single_wait == -1)
weight_single_wait1 = 0;
if (weight_many == -1)
@@ -534,7 +583,7 @@ static int __init scf_torture_init(void)
if (weight_all_wait == -1)
weight_all_wait1 = 0;
}
- if (weight_single1 == 0 && weight_single_wait1 == 0 &&
+ if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 &&
weight_many1 == 0 && weight_many_wait1 == 0 &&
weight_all1 == 0 && weight_all_wait1 == 0) {
VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
@@ -546,6 +595,7 @@ static int __init scf_torture_init(void)
else if (weight_resched1)
VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+ scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..978fcfca5871 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
obj-$(CONFIG_PSI) += psi.o
+obj-$(CONFIG_SCHED_CORE) += core_sched.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ca80df205ce..f21714ea3db8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,286 @@ unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
+
+ if (rt_prio(p->prio)) /* includes deadline */
+ return p->prio; /* [-1, 99] */
+
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+
+ int pa = __task_prio(a), pb = __task_prio(b);
+
+ if (-pa < -pb)
+ return true;
+
+ if (-pb < -pa)
+ return false;
+
+ if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+ return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b, in_fi);
+
+ return false;
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+{
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ return true;
+
+ return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+ return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+ const struct task_struct *p = __node_2_sc(node);
+ unsigned long cookie = (unsigned long)key;
+
+ if (cookie < p->core_cookie)
+ return -1;
+
+ if (cookie > p->core_cookie)
+ return 1;
+
+ return 0;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!sched_core_enqueued(p))
+ return;
+
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+ /*
+ * The idle task always matches any cookie!
+ */
+ if (!node)
+ return idle_sched_class.pick_task(rq);
+
+ return __node_2_sc(node);
+}
+
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = container_of(node, struct task_struct, core_node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ return p;
+}
+
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
+static void __sched_core_flip(bool enabled)
+{
+ unsigned long flags;
+ int cpu, t;
+
+ cpus_read_lock();
+
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ sched_core_lock(cpu, &flags);
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ sched_core_unlock(cpu, &flags);
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+ }
+
+ /*
+ * Toggle the offline CPUs.
+ */
+ cpumask_copy(&sched_core_mask, cpu_possible_mask);
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
+
+ for_each_cpu(cpu, &sched_core_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
+}
+
+static void sched_core_assert_empty(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+ sched_core_assert_empty();
+}
+
+static void __sched_core_disable(void)
+{
+ sched_core_assert_empty();
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ if (atomic_inc_not_zero(&sched_core_count))
+ return;
+
+ mutex_lock(&sched_core_mutex);
+ if (!atomic_read(&sched_core_count))
+ __sched_core_enable();
+
+ smp_mb__before_atomic();
+ atomic_inc(&sched_core_count);
+ mutex_unlock(&sched_core_mutex);
+}
+
+static void __sched_core_put(struct work_struct *work)
+{
+ if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+ }
+}
+
+void sched_core_put(void)
+{
+ static DECLARE_WORK(_work, __sched_core_put);
+
+ /*
+ * "There can be only one"
+ *
+ * Either this is the last one, or we don't actually need to do any
+ * 'work'. If it is the last *again*, we rely on
+ * WORK_STRUCT_PENDING_BIT.
+ */
+ if (!atomic_add_unless(&sched_core_count, -1, 1))
+ schedule_work(&_work);
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
@@ -184,6 +464,79 @@ int sysctl_sched_rt_runtime = 950000;
*
*/
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+ raw_spinlock_t *lock;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == __rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+ raw_spinlock_t *lock;
+ bool ret;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == __rq_lockp(rq)))) {
+ preempt_enable();
+ return ret;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+void raw_spin_rq_unlock(struct rq *rq)
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (__rq_lockp(rq1) == __rq_lockp(rq2))
+ return;
+
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+}
+#endif
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -196,12 +549,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -220,7 +573,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
/*
* move_queued_task() task_rq_lock()
*
@@ -242,7 +595,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
@@ -312,7 +665,7 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -585,7 +938,6 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- BUG_ON(!task);
/* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -611,7 +963,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr))
return;
@@ -635,10 +987,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
@@ -655,6 +1007,7 @@ int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu))
@@ -662,10 +1015,11 @@ int get_nohz_timer_target(void)
default_cpu = cpu;
}
+ hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
@@ -1065,9 +1419,10 @@ static void uclamp_sync_util_min_rt_default(void)
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
+ /* Copy by value as we could modify it */
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;
/*
* Tasks in autogroups or root task group will be
@@ -1078,9 +1433,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
if (task_group(p) == &root_task_group)
return uc_req;
- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
@@ -1137,7 +1494,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -1177,7 +1534,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int bkt_clamp;
unsigned int rq_clamp;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* If sched_uclamp_used was enabled after task @p was enqueued,
@@ -1278,9 +1635,27 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+uclamp_update_active(struct task_struct *p)
{
+ enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
@@ -1300,30 +1675,22 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
- unsigned int clamps)
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
- enum uclamp_id clamp_id;
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- for_each_clamp_id(clamp_id) {
- if ((0x1 << clamp_id) & clamps)
- uclamp_update_active(p, clamp_id);
- }
- }
+ while ((p = css_task_iter_next(&it)))
+ uclamp_update_active(p);
css_task_iter_end(&it);
}
@@ -1590,27 +1957,38 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
+bool sched_task_on_rq(struct task_struct *p)
+{
+ return task_on_rq_queued(p);
+}
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_queued(rq, p);
+ sched_info_enqueue(rq, p);
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p);
+
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
- sched_info_dequeued(rq, p);
+ sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
@@ -1632,12 +2010,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1649,15 +2033,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -1814,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu);
+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -1850,7 +2226,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
@@ -1916,7 +2292,6 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
- int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
@@ -1954,19 +2329,15 @@ static int migration_cpu_stop(void *data)
if (pending) {
p->migration_pending = NULL;
complete = true;
- }
- if (dest_cpu < 0) {
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
-
- dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, dest_cpu);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
- p->wake_cpu = dest_cpu;
+ p->wake_cpu = arg->dest_cpu;
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -2024,7 +2395,7 @@ int push_cpu_stop(void *arg)
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (task_rq(p) != rq)
goto out_unlock;
@@ -2054,7 +2425,7 @@ int push_cpu_stop(void *arg)
out_unlock:
rq->push_busy = false;
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
@@ -2107,7 +2478,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
@@ -2126,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
__do_set_cpus_allowed(p, new_mask, 0);
}
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ if (!src->user_cpus_ptr)
+ return 0;
+
+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!dst->user_cpus_ptr)
+ return -ENOMEM;
+
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
/*
* This function is wildly self concurrent; here be dragons.
*
@@ -2249,7 +2648,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
init_completion(&my_pending.done);
my_pending.arg = (struct migration_arg) {
.task = p,
- .dest_cpu = -1, /* any */
+ .dest_cpu = dest_cpu,
.pending = &my_pending,
};
@@ -2257,6 +2656,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
} else {
pending = p->migration_pending;
refcount_inc(&pending->refs);
+ /*
+ * Affinity has changed, but we've already installed a
+ * pending. migration_cpu_stop() *must* see this, else
+ * we risk a completion of the pending despite having a
+ * task on a disallowed CPU.
+ *
+ * Serialized by p->pi_lock, so this is safe.
+ */
+ pending->arg.dest_cpu = dest_cpu;
}
}
pending = p->migration_pending;
@@ -2277,7 +2685,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (task_running(rq, p) || p->state == TASK_WAKING) {
+ if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
/*
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
@@ -2334,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
+ struct cpumask *user_mask = NULL;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
@@ -2369,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
@@ -2403,36 +2814,195 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
__do_set_cpus_allowed(p, new_mask, flags);
- return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ if (flags & SCA_USER)
+ user_mask = clear_user_cpus_ptr(p);
+
+ ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+ kfree(user_mask);
+
+ return ret;
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, u32 flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct cpumask *user_mask = NULL;
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ if (!p->user_cpus_ptr) {
+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!user_mask)
+ return -ENOMEM;
+ }
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ /*
+ * We're about to butcher the task affinity, so keep track of what
+ * the user asked for in case we're able to restore it later on.
+ */
+ if (user_mask) {
+ cpumask_copy(user_mask, p->cpus_ptr);
+ p->user_cpus_ptr = user_mask;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ kfree(user_mask);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = p->user_cpus_ptr;
+ unsigned long flags;
+
+ /*
+ * Try to restore the old affinity mask. If this fails, then
+ * we free the mask explicitly to avoid it being inherited across
+ * a subsequent fork().
+ */
+ if (!user_mask || !__sched_setaffinity(p, user_mask))
+ return;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ user_mask = clear_user_cpus_ptr(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ kfree(user_mask);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
+ unsigned int state = READ_ONCE(p->__state);
+
/*
* We should never call set_task_cpu() on a blocked task,
* ttwu() will sort out the placement.
*/
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !p->on_rq);
+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
/*
* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
* because schedstat_wait_{start,end} rebase migrating task's wait_start
* time relying on p->on_rq.
*/
- WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ WARN_ON_ONCE(state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
@@ -2448,7 +3018,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@ -2604,7 +3174,7 @@ out:
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
int running, queued;
struct rq_flags rf;
@@ -2632,7 +3202,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
return 0;
cpu_relax();
}
@@ -2647,7 +3217,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if (!match_state || READ_ONCE(p->__state) == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -2760,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -2779,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
@@ -2792,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, cpu_possible_mask);
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -2956,7 +3522,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
#ifdef CONFIG_SMP
@@ -2979,6 +3545,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
if (rq->avg_idle > max)
rq->avg_idle = max;
+ rq->wake_stamp = jiffies;
+ rq->wake_avg_idle = rq->avg_idle / 2;
+
rq->idle_stamp = 0;
}
#endif
@@ -2990,7 +3559,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -3207,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ * bits set. This allows to distinguish all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
+ }
+
+ if (READ_ONCE(p->__state) & state) {
+ *success = 1;
+ return true;
+ }
+
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock. If the state matches, set p::saved_state to
+ * TASK_RUNNING, but do not wake the task because it waits
+ * for a lock wakeup. Also indicate success because from
+ * the regular waker's point of view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ *success = 1;
+ }
+#endif
+ return false;
+}
+
+/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
@@ -3345,12 +3963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(p->state & state))
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
goto out;
}
@@ -3363,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(p->state & state))
+ if (!ttwu_state_match(p, state, &success))
goto unlock;
trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
-
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
@@ -3429,7 +4043,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* TASK_WAKING such that we can unlock p->pi_lock before doing the
* enqueue, such as ttwu_queue_wakelist().
*/
- p->state = TASK_WAKING;
+ WRITE_ONCE(p->__state, TASK_WAKING);
/*
* If the owning (remote) CPU is still in the middle of schedule() with
@@ -3522,7 +4136,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t
ret = func(p, arg);
rq_unlock(rq, &rf);
} else {
- switch (p->state) {
+ switch (READ_ONCE(p->__state)) {
case TASK_RUNNING:
case TASK_WAKING:
break;
@@ -3648,7 +4262,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#ifdef CONFIG_SCHEDSTATS
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-static bool __initdata __sched_schedstats = false;
static void set_schedstats(bool enabled)
{
@@ -3672,16 +4285,11 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
- /*
- * This code is called before jump labels have been set up, so we can't
- * change the static branch directly just yet. Instead set a temporary
- * variable so init_schedstats() can do it later.
- */
if (!strcmp(str, "enable")) {
- __sched_schedstats = true;
+ set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
- __sched_schedstats = false;
+ set_schedstats(false);
ret = 1;
}
out:
@@ -3692,11 +4300,6 @@ out:
}
__setup("schedstats=", setup_schedstats);
-static void __init init_schedstats(void)
-{
- set_schedstats(__sched_schedstats);
-}
-
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
@@ -3718,8 +4321,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
-#else /* !CONFIG_SCHEDSTATS */
-static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
/*
@@ -3735,7 +4336,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_NEW;
+ p->__state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -3755,7 +4356,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -3841,7 +4442,7 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -4001,7 +4602,7 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
void (*func)(struct rq *rq);
struct callback_head *next;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
while (head) {
func = (void (*)(struct rq *))head->func;
@@ -4024,7 +4625,7 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
{
struct callback_head *head = rq->balance_callback;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (head)
rq->balance_callback = NULL;
@@ -4041,9 +4642,9 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
unsigned long flags;
if (unlikely(head)) {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
do_balance_callbacks(rq, head);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -4074,10 +4675,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, _THIS_IP_);
+ spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}
@@ -4088,9 +4689,9 @@ static inline void finish_lock_switch(struct rq *rq)
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
/*
@@ -4203,10 +4804,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
- prev_state = prev->state;
+ prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
+ tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
@@ -4252,7 +4854,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
put_task_struct_rcu_user(prev);
}
- tick_nohz_task_switch();
return rq;
}
@@ -4348,9 +4949,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
@@ -4395,7 +4996,7 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
@@ -4430,9 +5031,9 @@ unsigned long nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
@@ -4897,7 +5498,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- if (!preempt && prev->state && prev->non_block_count) {
+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
prev->comm, prev->pid, prev->non_block_count);
dump_stack();
@@ -4943,7 +5544,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -4961,7 +5562,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (unlikely(p == RETRY_TASK))
goto restart;
- /* Assumes fair_sched_class->next == idle_sched_class */
+ /* Assume the next prioritized class is idle_sched_class */
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
@@ -4983,6 +5584,545 @@ restart:
BUG();
}
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+ return (task_rq(t)->idle == t);
+}
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_task_rq_idle(a) || is_task_rq_idle(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ * rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = rq->core->core_cookie;
+
+ class_pick = class->pick_task(rq);
+ if (!class_pick)
+ return NULL;
+
+ if (!cookie) {
+ /*
+ * If class_pick is tagged, return it only if it has
+ * higher priority than max.
+ */
+ if (max && class_pick->core_cookie &&
+ prio_less(class_pick, max, in_fi))
+ return idle_sched_class.pick_task(rq);
+
+ return class_pick;
+ }
+
+ /*
+ * If class_pick is idle or matches cookie, return early.
+ */
+ if (cookie_equals(class_pick, cookie))
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (prio_less(cookie_pick, class_pick, in_fi) &&
+ (!max || prio_less(max, class_pick, in_fi)))
+ return class_pick;
+
+ return cookie_pick;
+}
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *max = NULL;
+ const struct sched_class *class;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ int i, j, cpu, occ = 0;
+ bool need_sync;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ cpu = cpu_of(rq);
+
+ /* Stopper task is switching into idle, no need core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ /*
+ * Reset core_pick so that we don't enter the fastpath when
+ * coming online. core_pick would already be migrated to
+ * another cpu during offline.
+ */
+ rq->core_pick = NULL;
+ return __pick_next_task(rq, prev, rf);
+ }
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ *
+ * rq->core_pick can be NULL if no selection was made for a CPU because
+ * it was either offline or went offline during a sibling's core-wide
+ * selection. In this case, do a core-wide selection.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+
+ rq->core_pick = NULL;
+ return next;
+ }
+
+ put_prev_task_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle) {
+ need_sync = true;
+ fi_before = true;
+ rq->core->core_forceidle = false;
+ }
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /*
+ * Optimize for common case where this CPU has no cookies
+ * and there are no cookied tasks running on siblings.
+ */
+ if (!need_sync) {
+ for_each_class(class) {
+ next = class->pick_task(rq);
+ if (next)
+ break;
+ }
+
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto done;
+ }
+ }
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ rq_i->core_pick = NULL;
+
+ if (i != cpu)
+ update_rq_clock(rq_i);
+ }
+
+ /*
+ * Try and select tasks for each sibling in descending sched_class
+ * order.
+ */
+ for_each_class(class) {
+again:
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ struct rq *rq_i = cpu_rq(i);
+ struct task_struct *p;
+
+ if (rq_i->core_pick)
+ continue;
+
+ /*
+ * If this sibling doesn't yet have a suitable task to
+ * run; ask for the most eligible task, given the
+ * highest priority task already selected for this
+ * core.
+ */
+ p = pick_task(rq_i, class, max, fi_before);
+ if (!p)
+ continue;
+
+ if (!is_task_rq_idle(p))
+ occ++;
+
+ rq_i->core_pick = p;
+ if (rq_i->idle == p && rq_i->nr_running) {
+ rq->core->core_forceidle = true;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+
+ /*
+ * If this new candidate is of higher priority than the
+ * previous; and they're incompatible; we need to wipe
+ * the slate and start over. pick_task makes sure that
+ * p's priority is more than max if it doesn't match
+ * max's cookie.
+ *
+ * NOTE: this is a linear max-filter and is thus bounded
+ * in execution time.
+ */
+ if (!max || !cookie_match(max, p)) {
+ struct task_struct *old_max = max;
+
+ rq->core->core_cookie = p->core_cookie;
+ max = p;
+
+ if (old_max) {
+ rq->core->core_forceidle = false;
+ for_each_cpu(j, smt_mask) {
+ if (j == i)
+ continue;
+
+ cpu_rq(j)->core_pick = NULL;
+ }
+ occ = 1;
+ goto again;
+ }
+ }
+ }
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ /* Something should have been selected for current CPU */
+ WARN_ON_ONCE(!next);
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+ * happen to come online, but its too late and nothing was
+ * picked for it. That's Ok - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle))
+ task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+done:
+ set_next_task(rq, next);
+ return next;
+}
+
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ local_irq_disable();
+ double_rq_lock(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ goto unlock;
+
+ if (dst->curr != dst->idle)
+ goto unlock;
+
+ p = sched_core_find(src, cookie);
+ if (p == src->idle)
+ goto unlock;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next;
+
+ if (!cpumask_test_cpu(this, &p->cpus_mask))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+
+ deactivate_task(src, p, 0);
+ set_task_cpu(p, this);
+ activate_task(dst, p, 0);
+
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+unlock:
+ double_rq_unlock(dst, src);
+ local_irq_enable();
+
+ return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ preempt_disable();
+ rcu_read_lock();
+ raw_spin_rq_unlock_irq(rq);
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_rq_lock_irq(rq);
+ rcu_read_unlock();
+ preempt_enable();
+}
+
+static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+
+void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
+static void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
+
+ WARN_ON_ONCE(rq->core != rq);
+
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ goto unlock;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
+ }
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ goto unlock;
+
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+
+ if (t == cpu)
+ rq->core = core_rq;
+
+ WARN_ON_ONCE(rq->core != core_rq);
+ }
+
+unlock:
+ sched_core_unlock(cpu, &flags);
+}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ goto unlock;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ goto unlock;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ goto unlock;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle = rq->core_forceidle;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+
+unlock:
+ sched_core_unlock(cpu, &flags);
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE 0x0
+#define SM_PREEMPT 0x1
+#define SM_RTLOCK_WAIT 0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT (~0U)
+#else
+# define SM_MASK_PREEMPT SM_PREEMPT
+#endif
+
/*
* __schedule() is the main scheduler function.
*
@@ -5022,7 +6162,7 @@ restart:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -5035,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, preempt);
+ schedule_debug(prev, !!sched_mode);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(preempt);
+ rcu_note_context_switch(!!sched_mode);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -5074,10 +6214,10 @@ static void __sched notrace __schedule(bool preempt)
* - we form a control dependency vs deactivate_task() below.
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
- prev_state = prev->state;
- if (!preempt && prev_state) {
+ prev_state = READ_ONCE(prev->__state);
+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
- prev->state = TASK_RUNNING;
+ WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
@@ -5141,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -5150,7 +6290,7 @@ static void __sched notrace __schedule(bool preempt)
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
}
@@ -5162,7 +6302,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -5174,7 +6314,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
unsigned int task_flags;
- if (!tsk->state)
+ if (task_is_running(tsk))
return;
task_flags = tsk->flags;
@@ -5223,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
@@ -5249,9 +6389,9 @@ void __sched schedule_idle(void)
* current task can be in any other state. Note, idle is always in the
* TASK_RUNNING state.
*/
- WARN_ON_ONCE(current->state);
+ WARN_ON_ONCE(current->__state);
do {
- __schedule(false);
+ __schedule(SM_NONE);
} while (need_resched());
}
@@ -5286,6 +6426,18 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ do {
+ preempt_disable();
+ __schedule(SM_RTLOCK_WAIT);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
static void __sched notrace preempt_schedule_common(void)
{
do {
@@ -5304,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void)
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -5383,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -5532,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
@@ -5548,6 +6700,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -5663,22 +6827,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
- p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -5692,7 +6853,7 @@ out_unlock:
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
preempt_enable();
}
@@ -6031,35 +7192,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -6080,10 +7212,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
@@ -6281,6 +7411,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -6289,8 +7420,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -6303,7 +7434,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -6389,6 +7523,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -6526,6 +7661,16 @@ err_size:
return -E2BIG;
}
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -6587,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
rcu_read_unlock();
if (likely(p)) {
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
@@ -6735,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -6761,9 +7904,76 @@ out_unlock:
return retval;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+ int ret = 0;
+
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ ret = -EBUSY;
+ rcu_read_unlock();
+ return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
{
+ int retval;
cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, mask, cpus_allowed);
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
struct task_struct *p;
int retval;
@@ -6783,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = -EINVAL;
goto out_put_task;
}
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
+
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_free_new_mask;
+ retval = -EPERM;
+ goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_free_new_mask;
-
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ goto out_put_task;
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
+ retval = __sched_setaffinity(p, in_mask);
out_put_task:
put_task_struct(p);
return retval;
@@ -6987,6 +8151,17 @@ int __sched __cond_resched(void)
preempt_schedule_common();
return 1;
}
+ /*
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ */
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
@@ -7148,7 +8323,7 @@ again:
if (curr->sched_class != p->sched_class)
goto out_unlock;
- if (task_running(p_rq, p) || p->state)
+ if (task_running(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
@@ -7351,7 +8526,7 @@ void sched_show_task(struct task_struct *p)
pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
- if (p->state == TASK_RUNNING)
+ if (task_is_running(p))
pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -7375,26 +8550,28 @@ EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
+ unsigned int state = READ_ONCE(p->__state);
+
/* no filter, everything matches */
if (!state_filter)
return true;
/* filter, but doesn't match */
- if (!(p->state & state_filter))
+ if (!(state & state_filter))
return false;
/*
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
return false;
return true;
}
-void show_state_filter(unsigned long state_filter)
+void show_state_filter(unsigned int state_filter)
{
struct task_struct *g, *p;
@@ -7433,19 +8610,32 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
__sched_fork(0, idle);
+ /*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ set_kthread_struct(idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
- idle->state = TASK_RUNNING;
+ idle->__state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- idle->flags |= PF_IDLE;
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
scs_task_reset(idle);
kasan_unpoison_task_stack(idle);
@@ -7479,7 +8669,7 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -7605,6 +8795,7 @@ void idle_task_exit(void)
finish_arch_post_lock_switch();
}
+ scs_task_reset(current);
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
@@ -7645,8 +8836,7 @@ static void balance_push(struct rq *rq)
{
struct task_struct *push_task = rq->curr;
- lockdep_assert_held(&rq->lock);
- SCHED_WARN_ON(rq->cpu != smp_processor_id());
+ lockdep_assert_rq_held(rq);
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
@@ -7654,20 +8844,17 @@ static void balance_push(struct rq *rq)
rq->balance_callback = &balance_push_callback;
/*
- * Only active while going offline.
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- if (!cpu_dying(rq->cpu))
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
return;
/*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
- *
- * XXX: the idle task does not match kthread_is_per_cpu() due to
- * histerical raisins.
*/
- if (rq->idle == push_task ||
- kthread_is_per_cpu(push_task) ||
+ if (kthread_is_per_cpu(push_task) ||
is_migration_disabled(push_task)) {
/*
@@ -7683,9 +8870,9 @@ static void balance_push(struct rq *rq)
*/
if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
rcuwait_active(&rq->hotplug_wait)) {
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
rcuwait_wake_up(&rq->hotplug_wait);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
return;
}
@@ -7695,7 +8882,7 @@ static void balance_push(struct rq *rq)
* Temporarily drop rq->lock such that we can wake-up the stop task.
* Both preemption and IRQs are still disabled.
*/
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
/*
@@ -7703,7 +8890,7 @@ static void balance_push(struct rq *rq)
* schedule(). The next pick is obviously going to be the stop task
* which kthread_is_per_cpu() and will push this task away.
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
static void balance_push_set(int cpu, bool on)
@@ -7922,6 +9109,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
@@ -7947,6 +9136,7 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -7993,7 +9183,7 @@ static void dump_rq_tasks(struct rq *rq, const char *loglvl)
struct task_struct *g, *p;
int cpu = cpu_of(rq);
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
for_each_process_thread(g, p) {
@@ -8025,6 +9215,7 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
@@ -8045,6 +9236,7 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
+ current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
init_sched_rt_class();
@@ -8166,7 +9358,7 @@ void __init sched_init(void)
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -8214,6 +9406,8 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->wake_stamp = jiffies;
+ rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -8231,6 +9425,16 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = rq;
+ rq->core_pick = NULL;
+ rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle = false;
+
+ rq->core_cookie = 0UL;
+#endif
}
set_load_weight(&init_task, false);
@@ -8257,8 +9461,6 @@ void __init sched_init(void)
#endif
init_sched_fair_class();
- init_schedstats();
-
psi_init();
init_uclamp();
@@ -8276,15 +9478,15 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ unsigned int state = get_current_state();
/*
* Blocking primitives will set (and therefore destroy) current->state,
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
- "state=%lx set at [<%p>] %pS\n",
- current->state,
+ "state=%x set at [<%p>] %pS\n", state,
(void *)current->task_state_change,
(void *)current->task_state_change);
@@ -8680,7 +9882,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
cpu_util_update_eff(css);
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
#endif
return 0;
@@ -8741,7 +9947,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
* has happened. This would lead to problems with PELT, due to
* move wanting to detach+attach while we're not attached yet.
*/
- if (task->state == TASK_NEW)
+ if (READ_ONCE(task->__state) == TASK_NEW)
ret = -EINVAL;
raw_spin_unlock_irq(&task->pi_lock);
@@ -8770,6 +9976,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
enum uclamp_id clamp_id;
unsigned int clamps;
+ lockdep_assert_held(&uclamp_mutex);
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
@@ -8802,7 +10011,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
}
/* Immediately update descendants RUNNABLE tasks */
- uclamp_update_active_tasks(css, clamps);
+ uclamp_update_active_tasks(css);
}
}
@@ -8961,7 +10170,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
+ u64 burst)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8991,11 +10201,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (quota != RUNTIME_INF && quota > max_cfs_runtime)
return -EINVAL;
+ if (quota != RUNTIME_INF && (burst > quota ||
+ burst + quota > max_cfs_runtime))
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -9012,6 +10226,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
+ cfs_b->burst = burst;
__refill_cfs_bandwidth_runtime(cfs_b);
@@ -9038,16 +10253,17 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
period = ktime_to_ns(tg->cfs_bandwidth.period);
+ burst = tg->cfs_bandwidth.burst;
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
@@ -9055,7 +10271,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
else
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_quota(struct task_group *tg)
@@ -9073,15 +10289,16 @@ static long tg_get_cfs_quota(struct task_group *tg)
static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
return -EINVAL;
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
+ burst = tg->cfs_bandwidth.burst;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_period(struct task_group *tg)
@@ -9094,6 +10311,30 @@ static long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
+static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
+{
+ u64 quota, period, burst;
+
+ if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ burst = (u64)cfs_burst_us * NSEC_PER_USEC;
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
+ quota = tg->cfs_bandwidth.quota;
+
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
+}
+
+static long tg_get_cfs_burst(struct task_group *tg)
+{
+ u64 burst_us;
+
+ burst_us = tg->cfs_bandwidth.burst;
+ do_div(burst_us, NSEC_PER_USEC);
+
+ return burst_us;
+}
+
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -9118,6 +10359,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
+static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_burst(css_tg(css));
+}
+
+static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_burst_us)
+{
+ return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
+}
+
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -9251,6 +10504,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -9258,6 +10525,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -9271,6 +10543,11 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_cfs_period_write_u64,
},
{
+ .name = "cfs_burst_us",
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
+ {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
@@ -9435,12 +10712,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
{
struct task_group *tg = css_tg(of_css(of));
u64 period = tg_get_cfs_period(tg);
+ u64 burst = tg_get_cfs_burst(tg);
u64 quota;
int ret;
ret = cpu_period_quota_parse(buf, &period, &quota);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota);
+ ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
return ret ?: nbytes;
}
#endif
@@ -9459,6 +10737,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -9467,6 +10751,12 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
+ {
+ .name = "max.burst",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..9a80e9a474c0
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/prctl.h>
+#include "sched.h"
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+unsigned long sched_core_alloc_cookie(void)
+{
+ struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return 0;
+
+ refcount_set(&ck->refcnt, 1);
+ sched_core_get();
+
+ return (unsigned long)ck;
+}
+
+void sched_core_put_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+ kfree(ptr);
+ sched_core_put();
+ }
+}
+
+unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr)
+ refcount_inc(&ptr->refcnt);
+
+ return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+{
+ unsigned long old_cookie;
+ struct rq_flags rf;
+ struct rq *rq;
+ bool enqueued;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Since creating a cookie implies sched_core_get(), and we cannot set
+ * a cookie until after we've created it, similarly, we cannot destroy
+ * a cookie until after we've removed it, we must have core scheduling
+ * enabled here.
+ */
+ SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+ enqueued = sched_core_enqueued(p);
+ if (enqueued)
+ sched_core_dequeue(rq, p);
+
+ old_cookie = p->core_cookie;
+ p->core_cookie = cookie;
+
+ if (enqueued)
+ sched_core_enqueue(rq, p);
+
+ /*
+ * If task is currently running, it may not be compatible anymore after
+ * the cookie change, so enter the scheduler on its CPU to schedule it
+ * away.
+ */
+ if (task_running(rq, p))
+ resched_curr(rq);
+
+ task_rq_unlock(rq, p, &rf);
+
+ return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+ unsigned long cookie, flags;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cookie = sched_core_get_cookie(p->core_cookie);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return cookie;
+}
+
+void sched_core_fork(struct task_struct *p)
+{
+ RB_CLEAR_NODE(&p->core_node);
+ p->core_cookie = sched_core_clone_cookie(current);
+}
+
+void sched_core_free(struct task_struct *p)
+{
+ sched_core_put_cookie(p->core_cookie);
+}
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+ cookie = sched_core_get_cookie(cookie);
+ cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr)
+{
+ unsigned long cookie = 0, id = 0;
+ struct task_struct *task, *p;
+ struct pid *grp;
+ int err = 0;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+ (cmd != PR_SCHED_CORE_GET && uaddr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case PR_SCHED_CORE_GET:
+ if (type != PIDTYPE_PID || uaddr & 7) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ if (cookie) {
+ /* XXX improve ? */
+ ptr_to_hashval((void *)cookie, &id);
+ }
+ err = put_user(id, (u64 __user *)uaddr);
+ goto out;
+
+ case PR_SCHED_CORE_CREATE:
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ break;
+
+ case PR_SCHED_CORE_SHARE_TO:
+ cookie = sched_core_clone_cookie(current);
+ break;
+
+ case PR_SCHED_CORE_SHARE_FROM:
+ if (type != PIDTYPE_PID) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ __sched_core_set(current, cookie);
+ goto out;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ };
+
+ if (type == PIDTYPE_PID) {
+ __sched_core_set(task, cookie);
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ grp = task_pid_type(task, type);
+
+ do_each_pid_thread(grp, type, p) {
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out_tasklist;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ do_each_pid_thread(grp, type, p) {
+ __sched_core_set(p, cookie);
+ } while_each_pid_thread(grp, type, p);
+out_tasklist:
+ read_unlock(&tasklist_lock);
+
+out:
+ sched_core_put_cookie(cookie);
+ put_task_struct(task);
+ return err;
+}
+
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 104a1bade14f..893eece65bfd 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -112,7 +112,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
if (index == CPUACCT_STAT_NSTATS) {
@@ -126,7 +126,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
}
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
return data;
@@ -141,14 +141,14 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
@@ -253,13 +253,13 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
seq_puts(m, "\n");
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4f09afd2f321..e7af18857371 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -151,6 +151,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
+ util = map_util_perf(util);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -536,9 +537,17 @@ static struct attribute *sugov_attrs[] = {
};
ATTRIBUTE_GROUPS(sugov);
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
static struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
@@ -638,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -706,7 +713,7 @@ out:
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -733,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9a2989749b8d..e94314633b39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -157,7 +157,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -170,7 +170,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -184,7 +184,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -194,7 +194,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -348,10 +348,10 @@ static void task_non_contending(struct task_struct *p)
if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- if (p->state == TASK_DEAD)
+ if (READ_ONCE(p->__state) == TASK_DEAD)
sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
@@ -987,7 +987,7 @@ static int start_dl_timer(struct task_struct *p)
ktime_t now, act;
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* We want the timer to fire at the deadline, but considering
@@ -1097,9 +1097,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
update_rq_clock(rq);
/*
@@ -1355,10 +1355,10 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
@@ -1722,7 +1722,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
{
struct rq *rq;
- if (p->state != TASK_WAKING)
+ if (READ_ONCE(p->__state) != TASK_WAKING)
return;
rq = task_rq(p);
@@ -1731,8 +1731,9 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -1746,7 +1747,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -1852,7 +1853,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -1864,7 +1865,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+ struct task_struct *p;
+
+ p = pick_task_dl(rq);
+ if (p)
+ set_next_task_dl(rq, p, true);
+
return p;
}
@@ -2291,10 +2303,10 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
}
}
@@ -2486,6 +2498,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
+ } else {
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
}
}
@@ -2539,6 +2553,7 @@ DEFINE_SCHED_CLASS(dl) = {
#ifdef CONFIG_SMP
.balance = balance_dl,
+ .pick_task = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
@@ -2727,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2740,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2837,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c5aacbd492a1..17a653b67006 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[16];
+ unsigned int scaling;
if (cnt > 15)
cnt = 15;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
- if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ if (kstrtouint(buf, 10, &scaling))
return -EINVAL;
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
if (sched_update_scaling())
return -EINVAL;
@@ -388,6 +394,13 @@ void update_sched_domain_debugfs(void)
{
int cpu, i;
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
@@ -576,7 +589,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
@@ -584,7 +597,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -600,6 +613,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+ cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23663318fb81..f6a05d9b5443 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -268,33 +268,11 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (!path)
@@ -453,34 +431,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-
-static inline struct task_struct *task_of(struct sched_entity *se)
+static int tg_is_idle(struct task_group *tg)
{
- return container_of(se, struct task_struct, se);
+ return tg->idle > 0;
}
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
- return &task_rq(p)->cfs;
+ return cfs_rq->idle > 0;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static int se_is_idle(struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
}
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
@@ -514,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
+static inline int tg_is_idle(struct task_group *tg)
+{
+ return 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ return 0;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -1039,11 +1025,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
+ unsigned int state;
- if (tsk->state & TASK_INTERRUPTIBLE)
+ /* XXX racy against TTWU */
+ state = READ_ONCE(tsk->__state);
+ if (state & TASK_INTERRUPTIBLE)
__schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
+ if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
@@ -1107,7 +1096,7 @@ struct numa_group {
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
- (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -1529,7 +1518,7 @@ static inline bool is_core_idle(int cpu)
if (cpu == sibling)
continue;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(sibling))
return false;
}
#endif
@@ -3080,8 +3069,9 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
#else
static inline void
@@ -3139,7 +3129,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
- * \Sum grq->load.weight
+ * \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -3153,7 +3143,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
- * tg->load_avg
+ * tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
@@ -3169,7 +3159,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
- * grp->load.weight
+ * grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -3188,7 +3178,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
- * tg_load_avg'
+ * tg_load_avg'
*
* Where:
*
@@ -3341,6 +3331,15 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (child_cfs_rq_on_list(cfs_rq))
return false;
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(cfs_rq->avg.load_avg ||
+ cfs_rq->avg.util_avg ||
+ cfs_rq->avg.runnable_avg);
+
return true;
}
@@ -3594,9 +3593,12 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
+ se->avg.load_sum = runnable_sum;
+
delta = load_avg - se->avg.load_avg;
+ if (!delta)
+ return;
- se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta);
@@ -3716,15 +3718,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
+ sa->load_sum = sa->load_avg * divider;
r = removed_util;
sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
+ sa->util_sum = sa->util_avg * divider;
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
+ sa->runnable_sum = sa->runnable_avg * divider;
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4476,6 +4478,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4535,7 +4539,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
- if (cfs_rq->skip == se) {
+ if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
if (se == curr) {
@@ -4562,8 +4566,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = cfs_rq->last;
}
- clear_buddies(cfs_rq, se);
-
return se;
}
@@ -4685,8 +4687,11 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- if (cfs_b->quota != RUNTIME_INF)
- cfs_b->runtime = cfs_b->quota;
+ if (unlikely(cfs_b->quota == RUNTIME_INF))
+ return;
+
+ cfs_b->runtime += cfs_b->quota;
+ cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4868,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
@@ -4887,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
}
@@ -4925,45 +4936,55 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
+ /* Nothing to run but something to decay (on_list)? Complete the branch */
+ if (!cfs_rq->load.weight) {
+ if (cfs_rq->on_list)
+ goto unthrottle_throttle;
return;
+ }
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
if (se->on_rq)
break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (throttled_hierarchy(qcfs_rq))
+ list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -4976,9 +4997,9 @@ unthrottle_throttle:
* assertion below.
*/
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (list_add_leaf_cfs_rq(qcfs_rq))
break;
}
@@ -5047,6 +5068,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
+ /* Refill extra burst quota even if cfs_b->idle */
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
@@ -5054,8 +5078,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
if (cfs_b->idle && !throttled)
goto out_deactivate;
- __refill_cfs_bandwidth_runtime(cfs_b);
-
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -5108,7 +5130,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -5116,7 +5138,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -5305,6 +5327,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (new < max_cfs_quota_period) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
+ cfs_b->burst *= 2;
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5336,6 +5359,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->burst = 0;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -5385,7 +5409,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5404,7 +5428,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5598,6 +5622,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5615,6 +5642,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5692,6 +5722,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -5721,6 +5754,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -5992,11 +6028,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -6082,9 +6122,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
-static inline int __select_idle_cpu(int cpu)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -6154,7 +6195,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
int cpu;
if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
@@ -6210,7 +6251,7 @@ static inline bool test_idle_cores(int cpu, bool def)
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- return __select_idle_cpu(core);
+ return __select_idle_cpu(core, p);
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -6229,9 +6270,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
- u64 time;
+ u64 time = 0;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6241,12 +6283,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
+ unsigned long now = jiffies;
/*
- * Due to large variance we need a large fuzz factor;
- * hackbench in particularly is sensitive here.
+ * If we're busy, the assumption that the last idle period
+ * predicts the future is flawed; age away the remaining
+ * predicted idle time.
*/
- avg_idle = this_rq()->avg_idle / 512;
+ if (unlikely(this_rq->wake_stamp < now)) {
+ while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
+ this_rq->wake_stamp++;
+ this_rq->wake_avg_idle >>= 1;
+ }
+ }
+
+ avg_idle = this_rq->wake_avg_idle;
avg_cost = this_sd->avg_scan_cost + 1;
span_avg = sd->span_weight * avg_idle;
@@ -6258,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
@@ -6267,7 +6318,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
} else {
if (!--nr)
return -1;
- idle_cpu = __select_idle_cpu(cpu);
+ idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
break;
}
@@ -6278,6 +6329,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
+
+ /*
+ * Account for the scan cost of wakeups against the average
+ * idle time.
+ */
+ this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
+
update_avg(&this_sd->avg_scan_cost, time);
}
@@ -6345,6 +6403,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
task_util = uclamp_task_util(p);
}
+ /*
+ * per-cpu select_idle_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
asym_fits_capacity(task_util, target))
return target;
@@ -6373,6 +6436,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
@@ -6620,8 +6684,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
unsigned long max_util = 0, sum_util = 0;
+ unsigned long _cpu_cap = cpu_cap;
int cpu;
+ _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
@@ -6657,8 +6724,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
+ cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
+ ENERGY_UTIL, NULL);
+
+ sum_util += min(cpu_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
@@ -6669,10 +6738,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ max_util = max(max_util, min(cpu_util, _cpu_cap));
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
}
/*
@@ -6718,15 +6787,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu, target = -1;
unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
struct sched_domain *sd;
struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
+ goto unlock;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6736,7 +6805,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto fail;
+ goto unlock;
+
+ target = prev_cpu;
sync_entity_load_avg(&p->se);
if (!task_util_est(p))
@@ -6744,13 +6815,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ bool compute_prev_delta = false;
unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
-
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
@@ -6771,26 +6839,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
- /* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
- }
-
- /*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
- */
- if (spare_cap > max_spare_cap) {
+ /* Always use prev_cpu as a candidate. */
+ compute_prev_delta = true;
+ } else if (spare_cap > max_spare_cap) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * in the performance domain.
+ */
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
}
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
+ if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ continue;
+
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (compute_prev_delta) {
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ if (prev_delta < base_energy_pd)
+ goto unlock;
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0) {
cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ if (cur_delta < base_energy_pd)
+ goto unlock;
cur_delta -= base_energy_pd;
if (cur_delta < best_delta) {
best_delta = cur_delta;
@@ -6798,25 +6880,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
}
}
-unlock:
rcu_read_unlock();
/*
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
-
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
+ if ((prev_delta == ULONG_MAX) ||
+ (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ target = best_energy_cpu;
- return prev_cpu;
+ return target;
-fail:
+unlock:
rcu_read_unlock();
- return -1;
+ return target;
}
/*
@@ -6828,8 +6907,6 @@ fail:
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
@@ -6842,6 +6919,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
/* SD_flags and WF_flags share the first nibble */
int sd_flag = wake_flags & 0xF;
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
if (wake_flags & WF_TTWU) {
record_wakee(p);
@@ -6882,9 +6963,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
- if (want_affine)
- current->recent_used_cpu = cpu;
}
rcu_read_unlock();
@@ -6906,7 +6984,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* min_vruntime -- the latter is done by enqueue_entity() when placing
* the task on the new runqueue.
*/
- if (p->state == TASK_WAKING) {
+ if (READ_ONCE(p->__state) == TASK_WAKING) {
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 min_vruntime;
@@ -6931,7 +7009,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_rq_held(task_rq(p));
detach_entity_cfs_rq(&p->se);
} else {
@@ -7021,24 +7099,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->last = se;
}
}
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
}
}
@@ -7059,6 +7135,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
@@ -7103,8 +7180,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
BUG_ON(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle)
+ goto preempt;
+ if (cse_is_idle != pse_is_idle)
+ return;
+
+ update_curr(cfs_rq_of(se));
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
@@ -7135,6 +7225,39 @@ preempt:
set_last_buddy(se);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+ }
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+#endif
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -7558,7 +7681,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7580,6 +7703,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+ * Don't migrate task if the task's cookie does not match
+ * with the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -7656,7 +7787,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* We do not migrate tasks that are:
@@ -7745,7 +7876,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
@@ -7761,7 +7892,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7797,7 +7928,7 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
/*
* Source run queue has been emptied by another CPU, clear
@@ -7927,7 +8058,7 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -8893,6 +9024,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
@@ -9821,7 +9956,7 @@ more_balance:
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9829,8 +9964,7 @@ more_balance:
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
@@ -9847,7 +9981,7 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
@@ -10153,9 +10287,11 @@ static inline int on_null_domain(struct rq *rq)
static inline int find_new_ilb(void)
{
int ilb;
+ const struct cpumask *hk_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- housekeeping_cpumask(HK_FLAG_MISC)) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
if (ilb == smp_processor_id())
continue;
@@ -10632,6 +10768,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
+
+ /*
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10664,7 +10808,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
goto out;
}
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10697,12 +10841,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
@@ -10795,6 +10940,119 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 slice = sched_slice(cfs_rq_of(se), se);
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU.
+ */
+ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ struct sched_entity *sea = &a->se;
+ struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * min_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -10818,6 +11076,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -10903,7 +11163,7 @@ static inline bool vruntime_normalized(struct task_struct *p)
* waiting for actually being woken up by sched_ttwu_pending().
*/
if (!se->sum_exec_runtime ||
- (p->state == TASK_WAKING && p->sched_remote_wakeup))
+ (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
@@ -11189,9 +11449,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
@@ -11228,10 +11488,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ lockdep_assert_held(&shares_mutex);
+
/*
* We can't change the weight of the root cgroup.
*/
@@ -11240,9 +11502,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
@@ -11260,10 +11521,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf);
}
-done:
+ return 0;
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
+}
+
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
+
+ idle_task_delta = grp_cfs_rq->h_nr_running -
+ grp_cfs_rq->idle_h_nr_running;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
mutex_unlock(&shares_mutex);
return 0;
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
@@ -11313,6 +11652,7 @@ DEFINE_SCHED_CLASS(fair) = {
#ifdef CONFIG_SMP
.balance = balance_fair,
+ .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7ca3d3d86c2a..d17b0a5ce6ac 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
it.timer.function = idle_inject_timer_fn;
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
@@ -437,8 +437,16 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+ queue_core_balance(rq);
}
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+ return rq->idle;
+}
+#endif
+
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
@@ -455,10 +463,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
}
/*
@@ -506,6 +514,7 @@ DEFINE_SCHED_CLASS(idle) = {
#ifdef CONFIG_SMP
.balance = balance_idle,
+ .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 5a6ea03f9882..7f06eaf12818 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -81,11 +81,9 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
cpumask_var_t tmp;
- int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
- err = cpulist_parse(str, non_housekeeping_mask);
- if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 1c79896f1bc0..954b229868d9 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -81,7 +81,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (long)this_rq->nr_uninterruptible;
+ nr_active += (int)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index cfe94ffd2b38..e06071bf3472 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -132,7 +132,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
static inline u64 rq_clock_pelt(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3cff41f..1652f2bb54b7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -148,6 +148,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -182,6 +183,8 @@ struct psi_group psi_system = {
static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);
+
static void group_init(struct psi_group *group)
{
int cpu;
@@ -201,6 +204,8 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
+ init_waitqueue_head(&group->poll_wait);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, NULL);
}
@@ -211,6 +216,9 @@ void __init psi_init(void)
return;
}
+ if (!cgroup_psi_enabled())
+ static_branch_disable(&psi_cgroups_enabled);
+
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
}
@@ -744,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
+ if (*iter == &psi_system)
+ return NULL;
+
#ifdef CONFIG_CGROUPS
- struct cgroup *cgroup = NULL;
+ if (static_branch_likely(&psi_cgroups_enabled)) {
+ struct cgroup *cgroup = NULL;
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else if (*iter == &psi_system)
- return NULL;
- else
- cgroup = cgroup_parent(*iter);
+ if (!*iter)
+ cgroup = task->cgroups->dfl_cgrp;
+ else
+ cgroup = cgroup_parent(*iter);
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
+ if (cgroup && cgroup_parent(cgroup)) {
+ *iter = cgroup;
+ return cgroup_psi(cgroup);
+ }
}
-#else
- if (*iter)
- return NULL;
#endif
*iter = &psi_system;
return &psi_system;
@@ -1157,9 +1165,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return ERR_CAST(task);
}
atomic_set(&group->poll_wakeup, 0);
- init_waitqueue_head(&group->poll_wait);
wake_up_process(task);
- timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, task);
}
@@ -1211,6 +1217,7 @@ static void psi_trigger_destroy(struct kref *ref)
group->poll_task,
lockdep_is_held(&group->trigger_lock));
rcu_assign_pointer(group->poll_task, NULL);
+ del_timer(&group->poll_timer);
}
}
@@ -1223,17 +1230,14 @@ static void psi_trigger_destroy(struct kref *ref)
*/
synchronize_rcu();
/*
- * Destroy the kworker after releasing trigger_lock to prevent a
+ * Stop kthread 'psimon' after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
* can no longer be found through group->poll_task.
- * But it might have been already scheduled before
- * that - deschedule it cleanly before destroying it.
*/
- del_timer_sync(&group->poll_timer);
kthread_stop(task_to_destroy);
}
kfree(t);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c286e5ba3c94..3daf42a0f462 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -888,7 +888,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -926,7 +926,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -1626,7 +1626,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
@@ -1634,7 +1634,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = pick_task_rt(rq);
+
+ if (p)
+ set_next_task_rt(rq, p, true);
+
return p;
}
@@ -1894,10 +1904,10 @@ retry:
*/
push_task = get_push_task(rq);
if (push_task) {
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
}
return 0;
@@ -2122,10 +2132,10 @@ void rto_push_irq_work_func(struct irq_work *work)
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
while (push_rt_task(rq, true))
;
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
raw_spin_lock(&rd->rto_lock);
@@ -2243,10 +2253,10 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
}
}
@@ -2331,13 +2341,20 @@ void __init init_sched_rt_class(void)
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
/*
- * If we are already running, then there's nothing
- * that needs to be done. But if we are not running
- * we may need to preempt the current running task.
- * If that current running task is also an RT task
+ * If we are running, update the avg_rt tracking, as the running time
+ * will now on be accounted into the latter.
+ */
+ if (task_current(rq, p)) {
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ return;
+ }
+
+ /*
+ * If we are not running we may need to preempt the current
+ * running task. If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
@@ -2483,6 +2500,7 @@ DEFINE_SCHED_CLASS(rt) = {
#ifdef CONFIG_SMP
.balance = balance_rt,
+ .pick_task = pick_task_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a189bec13729..3d3e5793e117 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@ -366,6 +368,7 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota;
u64 runtime;
+ u64 burst;
s64 hierarchical_quota;
u8 idle;
@@ -393,6 +396,9 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -502,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -526,6 +534,11 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
+#endif
+
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
@@ -593,6 +606,9 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
@@ -631,8 +647,8 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
+ unsigned int rt_nr_migratory;
+ unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
@@ -646,7 +662,7 @@ struct rt_rq {
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ unsigned int rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
@@ -663,7 +679,7 @@ struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned int dl_nr_running;
#ifdef CONFIG_SMP
/*
@@ -677,7 +693,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
+ unsigned int dl_nr_migratory;
int overloaded;
/*
@@ -905,7 +921,7 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
/*
* nr_running and cpu_load should be in the same cacheline because
@@ -955,7 +971,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned int nr_uninterruptible;
struct task_struct __rcu *curr;
struct task_struct *idle;
@@ -1017,6 +1033,9 @@ struct rq {
u64 idle_stamp;
u64 avg_idle;
+ unsigned long wake_stamp;
+ u64 wake_avg_idle;
+
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
@@ -1075,6 +1094,22 @@ struct rq {
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ struct task_struct *core_pick;
+ unsigned int core_enabled;
+ unsigned int core_sched_seq;
+ struct rb_root core_tree;
+
+ /* shared state -- careful with sched_core_cpu_deactivate() */
+ unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
+ unsigned char core_forceidle;
+ unsigned int core_forceidle_seq;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1113,6 +1148,206 @@ static inline bool is_migration_disabled(struct task_struct *p)
#endif
}
+struct sched_group;
+#ifdef CONFIG_SCHED_CORE
+static inline struct cpumask *sched_group_span(struct sched_group *sg);
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+/*
+ * Be careful with this function; not for general use. The return value isn't
+ * stable unless you actually hold a relevant rq->__lock.
+ */
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ if (rq->core_enabled)
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+
+/*
+ * Helpers to check if the CPU's core cookie matches with the task's cookie
+ * when core scheduling is enabled.
+ * A special case is that the task's cookie always matches with CPU's core
+ * cookie if the CPU is in an idle core.
+ */
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ return rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ bool idle_core = true;
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
+ if (!available_idle_cpu(cpu)) {
+ idle_core = false;
+ break;
+ }
+ }
+
+ /*
+ * A CPU in an idle core is always the best choice for tasks with
+ * cookies.
+ */
+ return idle_core || rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
+ if (sched_core_cookie_match(rq, p))
+ return true;
+ }
+ return false;
+}
+
+extern void queue_core_balance(struct rq *rq);
+
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+ return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+extern unsigned long sched_core_alloc_cookie(void);
+extern void sched_core_put_cookie(unsigned long cookie);
+extern unsigned long sched_core_get_cookie(unsigned long cookie);
+extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie);
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline void queue_core_balance(struct rq *rq)
+{
+}
+
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ return true;
+}
+#endif /* CONFIG_SCHED_CORE */
+
+static inline void lockdep_assert_rq_held(struct rq *rq)
+{
+ lockdep_assert_held(__rq_lockp(rq));
+}
+
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
+extern bool raw_spin_rq_trylock(struct rq *rq);
+extern void raw_spin_rq_unlock(struct rq *rq);
+
+static inline void raw_spin_rq_lock(struct rq *rq)
+{
+ raw_spin_rq_lock_nested(rq, 0);
+}
+
+static inline void raw_spin_rq_lock_irq(struct rq *rq)
+{
+ local_irq_disable();
+ raw_spin_rq_lock(rq);
+}
+
+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_enable();
+}
+
+static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ raw_spin_rq_lock(rq);
+ return flags;
+}
+
+static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_restore(flags);
+}
+
+#define raw_spin_rq_lock_irqsave(rq, flags) \
+do { \
+ flags = _raw_spin_rq_lock_irqsave(rq); \
+} while (0)
+
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1134,6 +1369,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+#else
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+#endif
+
extern void update_rq_clock(struct rq *rq);
static inline u64 __rq_clock_broken(struct rq *rq)
@@ -1179,7 +1465,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock;
@@ -1187,7 +1473,7 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_task;
@@ -1213,7 +1499,7 @@ static inline u64 rq_clock_thermal(struct rq *rq)
static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags |= RQCF_REQ_SKIP;
}
@@ -1223,7 +1509,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
@@ -1254,7 +1540,7 @@ extern struct callback_head balance_push_callback;
*/
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
@@ -1272,12 +1558,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
rf->clock_update_flags = RQCF_UPDATED;
#endif
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1298,7 +1584,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline void
@@ -1307,7 +1593,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1315,7 +1601,7 @@ static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
@@ -1323,7 +1609,7 @@ static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
@@ -1331,7 +1617,7 @@ static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
@@ -1339,7 +1625,7 @@ static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
rq_repin_lock(rq, rf);
}
@@ -1348,7 +1634,7 @@ rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
static inline void
@@ -1356,7 +1642,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
static inline void
@@ -1364,7 +1650,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline struct rq *
@@ -1429,7 +1715,7 @@ queue_balance_callback(struct rq *rq,
struct callback_head *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
@@ -1844,6 +2130,9 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -1893,7 +2182,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next, false);
}
@@ -1956,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
#ifdef CONFIG_SMP
@@ -1969,7 +2258,7 @@ static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->push_busy)
return NULL;
@@ -1977,6 +2266,9 @@ static inline struct task_struct *get_push_task(struct rq *rq)
if (p->nr_cpus_allowed == 1)
return NULL;
+ if (p->migration_disabled)
+ return NULL;
+
rq->push_busy = true;
return get_task_struct(p);
}
@@ -2107,6 +2399,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -2181,10 +2488,38 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
+{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif
+ return rq1->cpu < rq2->cpu;
+}
+
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2199,7 +2534,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
return 1;
@@ -2218,20 +2553,21 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest))
+ return 0;
+
+ if (likely(raw_spin_rq_trylock(busiest)))
+ return 0;
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (rq_order_less(this_rq, busiest)) {
+ raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ return 0;
}
- return ret;
+
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
}
#endif /* CONFIG_PREEMPTION */
@@ -2241,11 +2577,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
@@ -2253,8 +2585,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (__rq_lockp(this_rq) != __rq_lockp(busiest))
+ raw_spin_rq_unlock(busiest);
+ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2285,31 +2618,6 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
}
/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
-}
-
-/*
* double_rq_unlock - safely unlock two runqueues
*
* Note this does not restore interrupts like task_rq_unlock,
@@ -2319,11 +2627,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}
extern void set_rq_online (struct rq *rq);
@@ -2344,7 +2652,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
BUG_ON(!irqs_disabled());
BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
}
@@ -2359,7 +2667,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq2->lock)
{
BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ raw_spin_rq_unlock(rq1);
__release(rq2->lock);
}
@@ -2539,20 +2847,27 @@ static __always_inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
- unsigned long min_util;
- unsigned long max_util;
+ unsigned long min_util = 0;
+ unsigned long max_util = 0;
if (!static_branch_likely(&sched_uclamp_used))
return util;
- min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
-
if (p) {
- min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
- max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ min_util = uclamp_eff_value(p, UCLAMP_MIN);
+ max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+ /*
+ * Ignore last runnable task's max clamp, as this task will
+ * reset it. Similarly, no need to read the rq's min clamp.
+ */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ goto out;
}
+ min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+ max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
* RUNNABLE tasks with _different_ clamps, we can end up with an
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index dc218e9f4558..d8f8eb0c655b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -25,7 +25,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
@@ -42,7 +42,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#else /* !CONFIG_SCHEDSTATS: */
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
-static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
@@ -150,29 +150,24 @@ static inline void psi_sched_switch(struct task_struct *prev,
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
- t->sched_info.last_queued = 0;
-}
-
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long delta = 0;
- if (sched_info_on()) {
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- }
- sched_info_reset_dequeued(t);
+ if (!t->sched_info.last_queued)
+ return;
+
+ delta = rq_clock(rq) - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(rq, delta);
+ rq_sched_info_dequeue(rq, delta);
}
/*
@@ -182,11 +177,14 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now, delta = 0;
+
+ if (!t->sched_info.last_queued)
+ return;
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
+ now = rq_clock(rq);
+ delta = now - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
@@ -197,14 +195,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
- if (sched_info_on()) {
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
- }
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -212,7 +208,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
@@ -221,8 +217,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
rq_sched_info_depart(rq, delta);
- if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ if (task_is_running(t))
+ sched_info_enqueue(rq, t);
}
/*
@@ -231,7 +227,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the CPU. It's not interesting to record
@@ -245,18 +241,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
sched_info_arrive(rq, next);
}
-static inline void
-sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
-{
- if (sched_info_on())
- __sched_info_switch(rq, prev, next);
-}
-
#else /* !CONFIG_SCHED_INFO: */
-# define sched_info_queued(rq, t) do { } while (0)
-# define sched_info_reset_dequeued(t) do { } while (0)
-# define sched_info_dequeued(rq, t) do { } while (0)
-# define sched_info_depart(rq, t) do { } while (0)
-# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_enqueue(rq, t) do { } while (0)
+# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 55f39125c0e1..f988ebe3febb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
{
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ struct task_struct *p = pick_task_stop(rq);
+
+ if (p)
+ set_next_task_stop(rq, p, true);
+
+ return p;
+}
+
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -123,6 +132,7 @@ DEFINE_SCHED_CLASS(stop) = {
#ifdef CONFIG_SMP
.balance = balance_stop,
+ .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 55a0a243e871..4e8698e62f07 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -467,7 +467,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (rq->rd) {
old_rd = rq->rd;
@@ -493,7 +493,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -675,7 +675,7 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
@@ -1267,6 +1267,116 @@ next:
}
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+/*
+ * Set of available CPUs grouped by their corresponding capacities
+ * Each list entry contains a CPU mask reflecting CPUs that share the same
+ * capacity.
+ * The lifespan of data is unlimited.
+ */
+static LIST_HEAD(asym_cap_list);
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
+ * Verify whether there is any CPU capacity asymmetry in a given sched domain.
+ * Provides sd_flags reflecting the asymmetry scope.
+ */
+static inline int
+asym_cpu_capacity_classify(const struct cpumask *sd_span,
+ const struct cpumask *cpu_map)
+{
+ struct asym_cap_data *entry;
+ int count = 0, miss = 0;
+
+ /*
+ * Count how many unique CPU capacities this domain spans across
+ * (compare sched_domain CPUs mask with ones representing available
+ * CPUs capacities). Take into account CPUs that might be offline:
+ * skip those.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
+ ++count;
+ else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
+ ++miss;
+ }
+
+ WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
+
+ /* No asymmetry detected */
+ if (count < 2)
+ return 0;
+ /* Some of the available CPU capacity values have not been detected */
+ if (miss)
+ return SD_ASYM_CPUCAPACITY;
+
+ /* Full asymmetry */
+ return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
+
+}
+
+static inline void asym_cpu_capacity_update_data(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ struct asym_cap_data *entry = NULL;
+
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ goto done;
+ }
+
+ entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
+ if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
+ return;
+ entry->capacity = capacity;
+ list_add(&entry->link, &asym_cap_list);
+done:
+ __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
+}
+
+/*
+ * Build-up/update list of CPUs grouped by their capacities
+ * An update requires explicit request to rebuild sched domains
+ * with state indicating CPU topology changes.
+ */
+static void asym_cpu_capacity_scan(void)
+{
+ struct asym_cap_data *entry, *next;
+ int cpu;
+
+ list_for_each_entry(entry, &asym_cap_list, link)
+ cpumask_clear(cpu_capacity_span(entry));
+
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
+ asym_cpu_capacity_update_data(cpu);
+
+ list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
+ if (cpumask_empty(cpu_capacity_span(entry))) {
+ list_del(&entry->link);
+ kfree(entry);
+ }
+ }
+
+ /*
+ * Only one capacity value has been detected i.e. this system is symmetric.
+ * No need to keep this data around.
+ */
+ if (list_is_singular(&asym_cap_list)) {
+ entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
+ list_del(&entry->link);
+ kfree(entry);
+ }
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -1372,6 +1482,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1399,11 +1511,12 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
+ struct cpumask *sd_span;
#ifdef CONFIG_NUMA
/*
@@ -1420,9 +1533,6 @@ sd_init(struct sched_domain_topology_level *tl,
"wrong sd_flags in topology description\n"))
sd_flags &= TOPOLOGY_SD_FLAGS;
- /* Apply detected topology flags */
- sd_flags |= dflags;
-
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
@@ -1454,13 +1564,19 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
};
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- sd_id = cpumask_first(sched_domain_span(sd));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sd_span);
+
+ sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
+
+ WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
+ (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
+ "CPU capacity asymmetry not supported on SMT\n");
/*
* Convert topological properties into behaviour.
*/
-
/* Don't attempt to spread across CPUs of different capacities. */
if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
@@ -1719,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ /*
+ * Distance information can be unreliable for
+ * offline nodes, defer building the node
+ * masks to its bringup.
+ * This relies on all unique distance values
+ * still being visible at init time.
+ */
+ if (!node_online(j))
+ continue;
+
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1772,6 +1898,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
+
+ sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+ if (!sched_numa_onlined_nodes)
+ return;
+
+ bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+ for_each_online_node(i)
+ bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+ int i, j;
+
+ /*
+ * NUMA masks are not built for offline nodes in sched_init_numa().
+ * Thus, when a CPU of a never-onlined-before node gets plugged in,
+ * adding that new CPU to the right NUMA masks is not sufficient: the
+ * masks of that CPU's node must also be updated.
+ */
+ if (test_bit(node, sched_numa_onlined_nodes))
+ return;
+
+ bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j) || node == j)
+ continue;
+
+ if (node_distance(j, node) > sched_domains_numa_distance[i])
+ continue;
+
+ /* Add remote nodes in our masks */
+ cpumask_or(sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[0][j]);
+ }
+ }
+
+ /*
+ * A new node has been brought up, potentially changing the topology
+ * classification.
+ *
+ * Note that this is racy vs any use of sched_numa_topology_type :/
+ */
+ init_numa_topology_type();
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1779,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
+ __sched_domains_numa_masks_set(node);
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
@@ -1926,9 +2105,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
if (child) {
sd->level = child->level + 1;
@@ -1991,65 +2170,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
}
/*
- * Find the sched_domain_topology_level where all CPU capacities are visible
- * for all CPUs.
- */
-static struct sched_domain_topology_level
-*asym_cpu_capacity_level(const struct cpumask *cpu_map)
-{
- int i, j, asym_level = 0;
- bool asym = false;
- struct sched_domain_topology_level *tl, *asym_tl = NULL;
- unsigned long cap;
-
- /* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
-
- for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(i) != cap) {
- asym = true;
- break;
- }
- }
-
- if (!asym)
- return NULL;
-
- /*
- * Examine topology from all CPU's point of views to detect the lowest
- * sched_domain_topology_level where a highest capacity CPU is visible
- * to everyone.
- */
- for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(i);
- int tl_id = 0;
-
- for_each_sd_topology(tl) {
- if (tl_id < asym_level)
- goto next_level;
-
- for_each_cpu_and(j, tl->mask(i), cpu_map) {
- unsigned long capacity;
-
- capacity = arch_scale_cpu_capacity(j);
-
- if (capacity <= max_capacity)
- continue;
-
- max_capacity = capacity;
- asym_level = tl_id;
- asym_tl = tl;
- }
-next_level:
- tl_id++;
- }
- }
-
- return asym_tl;
-}
-
-
-/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -2061,7 +2181,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
- struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
if (WARN_ON(cpumask_empty(cpu_map)))
@@ -2071,24 +2190,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (alloc_state != sa_rootdomain)
goto error;
- tl_asym = asym_cpu_capacity_level(cpu_map);
-
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
- int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
- if (tl == tl_asym) {
- dflags |= SD_ASYM_CPUCAPACITY;
- has_asym = true;
- }
if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
goto error;
- sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+
+ has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
@@ -2217,6 +2331,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
arch_update_cpu_topology();
+ asym_cpu_capacity_scan();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
@@ -2299,6 +2414,9 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
+ /* Trigger rebuilding CPU capacity asymmetry data */
+ if (new_topology)
+ asym_cpu_capacity_scan();
if (!doms_new) {
WARN_ON_ONCE(dattr_new);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 183cc6ae68a6..76577d1642a5 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -264,17 +264,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent
}
EXPORT_SYMBOL(prepare_to_wait);
-void
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ bool was_empty = false;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- if (list_empty(&wq_entry->entry))
+ if (list_empty(&wq_entry->entry)) {
+ was_empty = list_empty(&wq_head->head);
__add_wait_queue_entry_tail(wq_head, wq_entry);
+ }
set_current_state(state);
spin_unlock_irqrestore(&wq_head->lock, flags);
+ return was_empty;
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 9f58049ac16d..4d8f44a17727 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -107,6 +107,7 @@ struct seccomp_knotif {
* installing process should allocate the fd as normal.
* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
* is allowed.
+ * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
* @ret: The return value of the installing process. It is set to the fd num
* upon success (>= 0).
* @completion: Indicates that the installing process has completed fd
@@ -118,6 +119,7 @@ struct seccomp_kaddfd {
struct file *file;
int fd;
unsigned int flags;
+ __u32 ioctl_flags;
union {
bool setfd;
@@ -600,7 +602,7 @@ static inline void seccomp_sync_threads(unsigned long flags)
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
atomic_set(&thread->seccomp.filter_count,
- atomic_read(&thread->seccomp.filter_count));
+ atomic_read(&caller->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
@@ -920,30 +922,6 @@ void get_seccomp_filter(struct task_struct *tsk)
refcount_inc(&orig->users);
}
-static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
-{
- clear_siginfo(info);
- info->si_signo = SIGSYS;
- info->si_code = SYS_SECCOMP;
- info->si_call_addr = (void __user *)KSTK_EIP(current);
- info->si_errno = reason;
- info->si_arch = syscall_get_arch(current);
- info->si_syscall = syscall;
-}
-
-/**
- * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
- * @syscall: syscall number to send to userland
- * @reason: filter-supplied reason code to send to userland (via si_errno)
- *
- * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
- */
-static void seccomp_send_sigsys(int syscall, int reason)
-{
- struct kernel_siginfo info;
- seccomp_init_siginfo(&info, syscall, reason);
- force_sig_info(&info);
-}
#endif /* CONFIG_SECCOMP_FILTER */
/* For use with seccomp_actions_logged */
@@ -1065,18 +1043,37 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
-static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n)
{
+ int fd;
+
/*
* Remove the notification, and reset the list pointers, indicating
* that it has been handled.
*/
list_del_init(&addfd->list);
if (!addfd->setfd)
- addfd->ret = receive_fd(addfd->file, addfd->flags);
+ fd = receive_fd(addfd->file, addfd->flags);
else
- addfd->ret = receive_fd_replace(addfd->fd, addfd->file,
- addfd->flags);
+ fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+ addfd->ret = fd;
+
+ if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
+ /* If we fail reset and return an error to the notifier */
+ if (fd < 0) {
+ n->state = SECCOMP_NOTIFY_SENT;
+ } else {
+ /* Return the FD we just added */
+ n->flags = 0;
+ n->error = 0;
+ n->val = fd;
+ }
+ }
+
+ /*
+ * Mark the notification as completed. From this point, addfd mem
+ * might be invalidated and we can't safely read it anymore.
+ */
complete(&addfd->completion);
}
@@ -1120,7 +1117,7 @@ static int seccomp_do_user_notification(int this_syscall,
struct seccomp_kaddfd, list);
/* Check if we were woken up by a addfd message */
if (addfd)
- seccomp_handle_addfd(addfd);
+ seccomp_handle_addfd(addfd, &n);
} while (n.state != SECCOMP_NOTIFY_REPLIED);
@@ -1197,7 +1194,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/* Show the handler the original registers. */
syscall_rollback(current, current_pt_regs());
/* Let the filter pass back 16 bits of data. */
- seccomp_send_sigsys(this_syscall, data);
+ force_sig_seccomp(this_syscall, data, false);
goto skip;
case SECCOMP_RET_TRACE:
@@ -1267,19 +1264,15 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
if (action != SECCOMP_RET_KILL_THREAD ||
- get_nr_threads(current) == 1) {
- kernel_siginfo_t info;
-
+ (atomic_read(&current->signal->live) == 1)) {
/* Show the original registers in the dump. */
syscall_rollback(current, current_pt_regs());
- /* Trigger a manual coredump since do_exit skips it. */
- seccomp_init_siginfo(&info, this_syscall, data);
- do_coredump(&info);
- }
- if (action == SECCOMP_RET_KILL_THREAD)
+ /* Trigger a coredump with SIGSYS */
+ force_sig_seccomp(this_syscall, data, true);
+ } else {
do_exit(SIGSYS);
- else
- do_group_exit(SIGSYS);
+ }
+ return -1; /* skip the syscall go directly to signal handling */
}
unreachable();
@@ -1581,7 +1574,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
if (addfd.newfd_flags & ~O_CLOEXEC)
return -EINVAL;
- if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+ if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND))
return -EINVAL;
if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
@@ -1591,6 +1584,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
if (!kaddfd.file)
return -EBADF;
+ kaddfd.ioctl_flags = addfd.flags;
kaddfd.flags = addfd.newfd_flags;
kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
kaddfd.fd = addfd.newfd;
@@ -1616,6 +1610,23 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
goto out_unlock;
}
+ if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
+ /*
+ * Disallow queuing an atomic addfd + send reply while there are
+ * some addfd requests still to process.
+ *
+ * There is no clear reason to support it and allows us to keep
+ * the loop on the other side straight-forward.
+ */
+ if (!list_empty(&knotif->addfd)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /* Allow exactly only one reply */
+ knotif->state = SECCOMP_NOTIFY_REPLIED;
+ }
+
list_add(&kaddfd.list, &knotif->addfd);
complete(&knotif->ready);
mutex_unlock(&filter->notify_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 30a0bee5ff9b..487bf4f5dadf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -54,6 +54,7 @@
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
+#include <asm/syscall.h> /* for syscall_get_* */
/*
* SLAB caches for signal bits.
@@ -412,8 +413,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct user_struct *user;
- int sigpending;
+ struct ucounts *ucounts = NULL;
+ long sigpending;
/*
* Protect access to @t credentials. This can go away when all
@@ -424,11 +425,11 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
* changes from/to zero.
*/
rcu_read_lock();
- user = __task_cred(t)->user;
- sigpending = atomic_inc_return(&user->sigpending);
- if (sigpending == 1)
- get_uid(user);
+ ucounts = task_ucounts(t);
+ sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
+ if (!sigpending)
+ return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
@@ -437,14 +438,12 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
}
if (unlikely(q == NULL)) {
- if (atomic_dec_and_test(&user->sigpending))
- free_uid(user);
+ dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags;
- q->user = user;
+ q->ucounts = ucounts;
}
-
return q;
}
@@ -452,8 +451,10 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (atomic_dec_and_test(&q->user->sigpending))
- free_uid(q->user);
+ if (q->ucounts) {
+ dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
+ q->ucounts = NULL;
+ }
kmem_cache_free(sigqueue_cachep, q);
}
@@ -1200,7 +1201,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
- case SIL_PERF_EVENT:
+ case SIL_FAULT_PERF_EVENT:
case SIL_SYS:
ret = false;
break;
@@ -1309,7 +1310,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
* that is why we also clear SIGNAL_UNKILLABLE.
*/
static int
-force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
+force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1320,7 +1321,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
blocked = sigismember(&t->blocked, sig);
- if (blocked || ignored) {
+ if (blocked || ignored || sigdfl) {
action->sa.sa_handler = SIG_DFL;
if (blocked) {
sigdelset(&t->blocked, sig);
@@ -1341,7 +1342,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
int force_sig_info(struct kernel_siginfo *info)
{
- return force_sig_info_to_task(info, current);
+ return force_sig_info_to_task(info, current, false);
}
/*
@@ -1400,6 +1401,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
return sighand;
}
+#ifdef CONFIG_LOCKDEP
+void lockdep_assert_task_sighand_held(struct task_struct *task)
+{
+ struct sighand_struct *sighand;
+
+ rcu_read_lock();
+ sighand = rcu_dereference(task->sighand);
+ if (sighand)
+ lockdep_assert_held(&sighand->siglock);
+ else
+ WARN_ON_ONCE(1);
+ rcu_read_unlock();
+}
+#endif
+
/*
* send signal info to all the members of a group
*/
@@ -1653,7 +1669,6 @@ void force_sigsegv(int sig)
}
int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t)
{
@@ -1664,28 +1679,22 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ARCH_SI_TRAPNO
- info.si_trapno = trapno;
-#endif
#ifdef __ia64__
info.si_imm = imm;
info.si_flags = flags;
info.si_isr = isr;
#endif
- return force_sig_info_to_task(&info, t);
+ return force_sig_info_to_task(&info, t, false);
}
int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
{
return force_sig_fault_to_task(sig, code, addr
- ___ARCH_SI_TRAPNO(trapno)
___ARCH_SI_IA64(imm, flags, isr), current);
}
int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_TRAPNO(int trapno)
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t)
{
@@ -1696,9 +1705,6 @@ int send_sig_fault(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ARCH_SI_TRAPNO
- info.si_trapno = trapno;
-#endif
#ifdef __ia64__
info.si_imm = imm;
info.si_flags = flags;
@@ -1780,6 +1786,27 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
return force_sig_info(&info);
}
+/**
+ * force_sig_seccomp - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+int force_sig_seccomp(int syscall, int reason, bool force_coredump)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch(current);
+ info.si_syscall = syscall;
+ return force_sig_info_to_task(&info, current, force_coredump);
+}
+
/* For the crazy architectures that include trap information in
* the errno field, instead of an actual errno value.
*/
@@ -1795,6 +1822,39 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
return force_sig_info(&info);
}
+/* For the rare architectures that include trap information using
+ * si_trapno.
+ */
+int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_trapno = trapno;
+ return force_sig_info(&info);
+}
+
+/* For the rare architectures that include trap information using
+ * si_trapno.
+ */
+int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
+ struct task_struct *t)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_trapno = trapno;
+ return send_sig_info(info.si_signo, &info, t);
+}
+
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
@@ -2544,7 +2604,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig)
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
- case SIL_PERF_EVENT:
+ case SIL_FAULT_PERF_EVENT:
ksig->info.si_addr = arch_untagged_si_addr(
ksig->info.si_addr, ksig->sig, ksig->info.si_code);
break;
@@ -2829,6 +2889,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
+ if (current->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(current);
tracehook_signal_handler(stepping);
}
@@ -3227,11 +3289,14 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
layout = SIL_FAULT_PKUERR;
#endif
else if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
- layout = SIL_PERF_EVENT;
-#ifdef __ARCH_SI_TRAPNO
- else if (layout == SIL_FAULT)
+ layout = SIL_FAULT_PERF_EVENT;
+ else if (IS_ENABLED(CONFIG_SPARC) &&
+ (sig == SIGILL) && (si_code == ILL_ILLTRP))
+ layout = SIL_FAULT_TRAPNO;
+ else if (IS_ENABLED(CONFIG_ALPHA) &&
+ ((sig == SIGFPE) ||
+ ((sig == SIGTRAP) && (si_code == TRAP_UNK))))
layout = SIL_FAULT_TRAPNO;
-#endif
}
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
@@ -3353,7 +3418,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
to->si_addr = ptr_to_compat(from->si_addr);
to->si_pkey = from->si_pkey;
break;
- case SIL_PERF_EVENT:
+ case SIL_FAULT_PERF_EVENT:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
@@ -3430,7 +3495,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
to->si_addr = compat_ptr(from->si_addr);
to->si_pkey = from->si_pkey;
break;
- case SIL_PERF_EVENT:
+ case SIL_FAULT_PERF_EVENT:
to->si_addr = compat_ptr(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
@@ -4147,11 +4212,7 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#ifdef CONFIG_COMPAT
@@ -4206,11 +4267,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
&uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#endif
@@ -4656,7 +4713,7 @@ void __init signals_init(void)
{
siginfo_buildtime_checks();
- sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}
#ifdef CONFIG_KGDB_KDB
@@ -4679,7 +4736,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
}
new_t = kdb_prev_t != t;
kdb_prev_t = t;
- if (t->state != TASK_RUNNING && new_t) {
+ if (!task_is_running(t) && new_t) {
spin_unlock(&t->sighand->siglock);
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
diff --git a/kernel/smp.c b/kernel/smp.c
index 52bf159ec400..f43ede0ab183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -764,7 +764,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
EXPORT_SYMBOL(smp_call_function_single);
/**
- * smp_call_function_single_async(): Run an asynchronous function on a
+ * smp_call_function_single_async() - Run an asynchronous function on a
* specific CPU.
* @cpu: The CPU to run on.
* @csd: Pre-allocated and setup data structure
@@ -783,6 +783,8 @@ EXPORT_SYMBOL(smp_call_function_single);
*
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
+ *
+ * Return: %0 on success or negative errno value on error
*/
int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
@@ -974,7 +976,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait
+ * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
* (atomically) until function has completed on other CPUs. If
* %SCF_RUN_LOCAL is set, the function will also be run locally
* if the local CPU is set in the @cpumask.
@@ -1180,7 +1182,13 @@ void wake_up_all_idle_cpus(void)
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
- * smp_call_on_cpu - Call a function on a specific cpu
+ * struct smp_call_on_cpu_struct - Call a function on a specific CPU
+ * @work: &work_struct
+ * @done: &completion to signal
+ * @func: function to call
+ * @data: function's data argument
+ * @ret: return value from @func
+ * @cpu: target CPU (%-1 for any CPU)
*
* Used to call a function on a specific cpu and wait for it to return.
* Optionally make sure the call is done on a specified physical cpu via vcpu
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f25208e8df83..f6bc0bc8a2aa 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu)
if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
return tsk;
}
@@ -48,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void)
*
* Creates the thread if it does not exist.
*/
-static inline void idle_init(unsigned int cpu)
+static __always_inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -292,7 +291,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -305,7 +304,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
@@ -318,12 +317,12 @@ EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
*/
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
list_del(&plug_thread->list);
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4992853ef53d..322b65d45676 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -76,7 +76,7 @@ static void wakeup_softirqd(void)
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
- if (tsk && tsk->state != TASK_RUNNING)
+ if (tsk)
wake_up_process(tsk);
}
@@ -92,8 +92,7 @@ static bool ksoftirqd_running(unsigned long pending)
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && (tsk->state == TASK_RUNNING) &&
- !__kthread_should_park(tsk);
+ return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
}
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -423,7 +422,7 @@ static inline void invoke_softirq(void)
if (ksoftirqd_running(local_softirq_pending()))
return;
- if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) {
+ if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 723fcc9d20db..43ba0b1e0edb 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -292,13 +292,15 @@ static int addr_conflict(struct static_call_site *site, void *start, void *end)
static int __static_call_text_reserved(struct static_call_site *iter_start,
struct static_call_site *iter_stop,
- void *start, void *end)
+ void *start, void *end, bool init)
{
struct static_call_site *iter = iter_start;
while (iter < iter_stop) {
- if (addr_conflict(iter, start, end))
- return 1;
+ if (init || !static_call_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
iter++;
}
@@ -324,7 +326,7 @@ static int __static_call_mod_text_reserved(void *start, void *end)
ret = __static_call_text_reserved(mod->static_call_sites,
mod->static_call_sites + mod->num_static_call_sites,
- start, end);
+ start, end, mod->state == MODULE_STATE_COMING);
module_put(mod);
@@ -459,8 +461,9 @@ static inline int __static_call_mod_text_reserved(void *start, void *end)
int static_call_text_reserved(void *start, void *end)
{
+ bool init = system_state < SYSTEM_RUNNING;
int ret = __static_call_text_reserved(__start_static_call_sites,
- __stop_static_call_sites, start, end);
+ __stop_static_call_sites, start, end, init);
if (ret)
return ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a583a29815f..8fdac0d90504 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -479,8 +479,9 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
- new_user != INIT_USER)
+ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
+ new_user != INIT_USER &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
@@ -558,6 +559,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -616,6 +621,10 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -691,6 +700,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -1834,7 +1847,6 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
- struct file *old_exe, *exe_file;
struct inode *inode;
int err;
@@ -1857,40 +1869,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- /*
- * Forbid mm->exe_file change if old file still mapped.
- */
- exe_file = get_mm_exe_file(mm);
- err = -EBUSY;
- if (exe_file) {
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_err;
- }
-
- mmap_read_unlock(mm);
- fput(exe_file);
- }
-
- err = 0;
- /* set the new file, lockless */
- get_file(exe.file);
- old_exe = xchg(&mm->exe_file, exe.file);
- if (old_exe)
- fput(old_exe);
+ err = replace_mm_exe_file(mm, exe.file);
exit:
fdput(exe);
return err;
-exit_err:
- mmap_read_unlock(mm);
- fput(exe_file);
- goto exit;
}
/*
@@ -1948,13 +1930,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
error = -EINVAL;
/*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
* Neither we should allow to override limits if they set.
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
@@ -2550,6 +2525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = set_syscall_user_dispatch(arg2, arg3, arg4,
(char __user *) arg5);
break;
+#ifdef CONFIG_SCHED_CORE
+ case PR_SCHED_CORE:
+ error = sched_core_share_pid(arg2, arg3, arg4, arg5);
+ break;
+#endif
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0ea8128468c3..f43d89d92860 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -99,7 +99,7 @@ COND_SYSCALL(flock);
/* fs/quota.c */
COND_SYSCALL(quotactl);
-COND_SYSCALL(quotactl_path);
+COND_SYSCALL(quotactl_fd);
/* fs/readdir.c */
@@ -289,17 +289,13 @@ COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
COND_SYSCALL(process_madvise);
+COND_SYSCALL(process_mrelease);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
-COND_SYSCALL_COMPAT(mbind);
COND_SYSCALL(get_mempolicy);
-COND_SYSCALL_COMPAT(get_mempolicy);
COND_SYSCALL(set_mempolicy);
-COND_SYSCALL_COMPAT(set_mempolicy);
COND_SYSCALL(migrate_pages);
-COND_SYSCALL_COMPAT(migrate_pages);
COND_SYSCALL(move_pages);
-COND_SYSCALL_COMPAT(move_pages);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
@@ -358,6 +354,8 @@ COND_SYSCALL(pkey_mprotect);
COND_SYSCALL(pkey_alloc);
COND_SYSCALL(pkey_free);
+/* memfd_secret */
+COND_SYSCALL(memfd_secret);
/*
* Architecture specific weak syscall entries.
@@ -414,7 +412,6 @@ COND_SYSCALL(epoll_wait);
COND_SYSCALL(recv);
COND_SYSCALL_COMPAT(recv);
COND_SYSCALL(send);
-COND_SYSCALL(bdflush);
COND_SYSCALL(uselib);
/* optional: time32 */
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index ccb78509f1a8..664ded05dd7a 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -49,7 +49,7 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
KUNIT_PROC_READ, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
/*
* See above.
@@ -58,7 +58,7 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table,
KUNIT_PROC_WRITE, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -95,7 +95,7 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
KUNIT_PROC_READ, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
/*
* See previous comment.
@@ -104,7 +104,7 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table,
KUNIT_PROC_WRITE, buffer, &len,
&pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -135,11 +135,11 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -174,7 +174,7 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer,
&len, &pos));
- KUNIT_EXPECT_EQ(test, (size_t)0, len);
+ KUNIT_EXPECT_EQ(test, 0, len);
}
/*
@@ -203,7 +203,7 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
user_buffer, &len, &pos));
- KUNIT_ASSERT_EQ(test, (size_t)3, len);
+ KUNIT_ASSERT_EQ(test, 3, len);
buffer[len] = '\0';
/* And we read 13 back out. */
KUNIT_EXPECT_STREQ(test, "13\n", buffer);
@@ -233,9 +233,9 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ,
user_buffer, &len, &pos));
- KUNIT_ASSERT_EQ(test, (size_t)4, len);
+ KUNIT_ASSERT_EQ(test, 4, len);
buffer[len] = '\0';
- KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer);
+ KUNIT_EXPECT_STREQ(test, "-16\n", buffer);
}
/*
@@ -265,7 +265,7 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
user_buffer, &len, &pos));
KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
- KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, pos);
KUNIT_EXPECT_EQ(test, 9, *((int *)table.data));
}
@@ -295,7 +295,7 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE,
user_buffer, &len, &pos));
KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len);
- KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos);
+ KUNIT_EXPECT_EQ(test, sizeof(input) - 1, pos);
KUNIT_EXPECT_EQ(test, -9, *((int *)table.data));
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d4a78e08f6d8..083be6af29d7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/sysctl.h>
#include <linux/bitmap.h>
#include <linux/signal.h>
+#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
@@ -71,6 +72,7 @@
#include <linux/coredump.h>
#include <linux/latencytop.h>
#include <linux/pid.h>
+#include <linux/delayacct.h>
#include "../lib/kstrtox.h"
@@ -534,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c)
}
}
+static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ *(bool *)valp = *lvalp;
+ } else {
+ int val = *(bool *)valp;
+
+ *lvalp = (unsigned long)val;
+ *negp = false;
+ }
+ return 0;
+}
+
static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
@@ -797,6 +814,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write,
}
/**
+ * proc_dobool - read/write a bool
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dobool(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dobool_conv, NULL);
+}
+
+/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
@@ -1494,7 +1531,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err = 0;
- bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
unsigned long *bitmap = *(unsigned long **) table->data;
@@ -1579,12 +1615,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
proc_skip_char(&p, &left, '\n');
}
left += skipped;
} else {
unsigned long bit_a, bit_b = 0;
+ bool first = 1;
while (left) {
bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
@@ -1629,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dobool(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -1747,6 +1789,17 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_TASK_DELAY_ACCT
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_TASK_DELAY_ACCT */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing",
@@ -2859,7 +2912,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_compaction_proactiveness,
.maxlen = sizeof(sysctl_compaction_proactiveness),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = compaction_proactiveness_sysctl_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
@@ -2909,11 +2962,11 @@ static struct ctl_table vm_table[] = {
.extra2 = &one_thousand,
},
{
- .procname = "percpu_pagelist_fraction",
- .data = &percpu_pagelist_fraction,
- .maxlen = sizeof(percpu_pagelist_fraction),
+ .procname = "percpu_pagelist_high_fraction",
+ .data = &percpu_pagelist_high_fraction,
+ .maxlen = sizeof(percpu_pagelist_high_fraction),
.mode = 0644,
- .proc_handler = percpu_pagelist_fraction_sysctl_handler,
+ .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
.extra1 = SYSCTL_ZERO,
},
{
@@ -2951,14 +3004,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "block_dump",
- .data = &block_dump,
- .maxlen = sizeof(block_dump),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "vfs_cache_pressure",
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
@@ -3421,6 +3466,7 @@ int __init sysctl_init(void)
* No sense putting this after each symbol definition, twice,
* exception granted :-)
*/
+EXPORT_SYMBOL(proc_dobool);
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 83e158d016ba..04bfd62f5e5c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,6 +64,15 @@ config LEGACY_TIMER_TICK
lack support for the generic clockevent framework.
New platforms should use generic clockevents instead.
+config TIME_KUNIT_TEST
+ tristate "KUnit test for kernel/time functions" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to test RTC library functions.
+
+ If unsure, say N.
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
@@ -117,13 +126,14 @@ config NO_HZ_FULL
the task mostly runs in userspace and has few kernel activity.
You need to fill up the nohz_full boot parameter with the
- desired range of dynticks CPUs.
+ desired range of dynticks CPUs to use it. This is implemented at
+ the expense of some overhead in user <-> kernel transitions:
+ syscalls, exceptions and interrupts.
- This is implemented at the expense of some overhead in user <-> kernel
- transitions: syscalls, exceptions and interrupts. Even when it's
- dynamically off.
+ By default, without passing the nohz_full parameter, this behaves just
+ like NO_HZ_IDLE.
- Say N.
+ If you're a distro say Y.
endchoice
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1fb1c1ef6a19..7e875e63ff3b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -21,3 +21,5 @@ obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
+obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f5490222e134..003ccf338d20 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -347,8 +347,7 @@ static void clockevents_notify_released(void)
while (!list_empty(&clockevents_released)) {
dev = list_entry(clockevents_released.next,
struct clock_event_device, list);
- list_del(&dev->list);
- list_add(&dev->list, &clockevent_devices);
+ list_move(&dev->list, &clockevent_devices);
tick_check_new_device(dev);
}
}
@@ -576,8 +575,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
if (old) {
module_put(old->owner);
clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
- list_del(&old->list);
- list_add(&old->list, &clockevents_released);
+ list_move(&old->list, &clockevents_released);
}
if (new) {
@@ -629,6 +627,7 @@ void tick_offline_cpu(unsigned int cpu)
/**
* tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+ * @cpu: The dead CPU
*/
void tick_cleanup_dead_cpu(int cpu)
{
@@ -668,9 +667,9 @@ static struct bus_type clockevents_subsys = {
static DEFINE_PER_CPU(struct device, tick_percpu_dev);
static struct tick_device *tick_get_tick_dev(struct device *dev);
-static ssize_t sysfs_show_current_tick_dev(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t current_device_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct tick_device *td;
ssize_t count = 0;
@@ -682,12 +681,12 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev,
raw_spin_unlock_irq(&clockevents_lock);
return count;
}
-static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
+static DEVICE_ATTR_RO(current_device);
/* We don't support the abomination of removable broadcast devices */
-static ssize_t sysfs_unbind_tick_dev(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t unbind_device_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
char name[CS_NAME_LEN];
ssize_t ret = sysfs_get_uname(buf, name, count);
@@ -714,7 +713,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
mutex_unlock(&clockevents_mutex);
return ret ? ret : count;
}
-static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
+static DEVICE_ATTR_WO(unbind_device);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
static struct device tick_bc_dev = {
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
new file mode 100644
index 000000000000..df922f49d171
--- /dev/null
+++ b/kernel/time/clocksource-wdtest.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Unit test for the clocksource watchdog.
+ *
+ * Copyright (C) 2021 Facebook, Inc.
+ *
+ * Author: Paul E. McKenney <paulmck@kernel.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
+
+#include "tick-internal.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
+module_param(holdoff, int, 0444);
+MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+
+/* Watchdog kthread's task_struct pointer for debug purposes. */
+static struct task_struct *wdtest_task;
+
+static u64 wdtest_jiffies_read(struct clocksource *cs)
+{
+ return (u64)jiffies;
+}
+
+static struct clocksource clocksource_wdtest_jiffies = {
+ .name = "wdtest-jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = TICK_NSEC,
+ .read = wdtest_jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .flags = CLOCK_SOURCE_MUST_VERIFY,
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
+};
+
+static int wdtest_ktime_read_ndelays;
+static bool wdtest_ktime_read_fuzz;
+
+static u64 wdtest_ktime_read(struct clocksource *cs)
+{
+ int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
+ static int sign = 1;
+ u64 ret;
+
+ if (wkrn) {
+ udelay(cs->uncertainty_margin / 250);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
+ }
+ ret = ktime_get_real_fast_ns();
+ if (READ_ONCE(wdtest_ktime_read_fuzz)) {
+ sign = -sign;
+ ret = ret + sign * 100 * NSEC_PER_MSEC;
+ }
+ return ret;
+}
+
+static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
+{
+ pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+}
+
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_VALID_FOR_HRES | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_VERIFY_PERCPU)
+
+static struct clocksource clocksource_wdtest_ktime = {
+ .name = "wdtest-ktime",
+ .rating = 300,
+ .read = wdtest_ktime_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = KTIME_FLAGS,
+ .mark_unstable = wdtest_ktime_cs_mark_unstable,
+ .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
+};
+
+/* Reset the clocksource if needed. */
+static void wdtest_ktime_clocksource_reset(void)
+{
+ if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
+ clocksource_unregister(&clocksource_wdtest_ktime);
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ schedule_timeout_uninterruptible(HZ / 10);
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ }
+}
+
+/* Run the specified series of watchdog tests. */
+static int wdtest_func(void *arg)
+{
+ unsigned long j1, j2;
+ char *s;
+ int i;
+
+ schedule_timeout_uninterruptible(holdoff * HZ);
+
+ /*
+ * Verify that jiffies-like clocksources get the manually
+ * specified uncertainty margin.
+ */
+ pr_info("--- Verify jiffies-like uncertainty margin.\n");
+ __clocksource_register(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+
+ j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ schedule_timeout_uninterruptible(HZ);
+ j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(j1 == j2);
+
+ clocksource_unregister(&clocksource_wdtest_jiffies);
+
+ /*
+ * Verify that tsc-like clocksources are assigned a reasonable
+ * uncertainty margin.
+ */
+ pr_info("--- Verify tsc-like uncertainty margin.\n");
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
+
+ j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ udelay(1);
+ j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
+ WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+
+ /* Verify tsc-like stability with various numbers of errors injected. */
+ for (i = 0; i <= max_cswd_read_retries + 1; i++) {
+ if (i <= 1 && i < max_cswd_read_retries)
+ s = "";
+ else if (i <= max_cswd_read_retries)
+ s = ", expect message";
+ else
+ s = ", expect clock skew";
+ pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, i);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
+ WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+ !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ wdtest_ktime_clocksource_reset();
+ }
+
+ /* Verify tsc-like stability with clock-value-fuzz error injection. */
+ pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
+ WRITE_ONCE(wdtest_ktime_read_fuzz, true);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ clocksource_verify_percpu(&clocksource_wdtest_ktime);
+ WRITE_ONCE(wdtest_ktime_read_fuzz, false);
+
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("--- Done with test.\n");
+ return 0;
+}
+
+static void wdtest_print_module_parms(void)
+{
+ pr_alert("--- holdoff=%d\n", holdoff);
+}
+
+/* Cleanup function. */
+static void clocksource_wdtest_cleanup(void)
+{
+}
+
+static int __init clocksource_wdtest_init(void)
+{
+ int ret = 0;
+
+ wdtest_print_module_parms();
+
+ /* Create watchdog-test task. */
+ wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
+ if (IS_ERR(wdtest_task)) {
+ ret = PTR_ERR(wdtest_task);
+ pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
+ wdtest_task = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+module_init(clocksource_wdtest_init);
+module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2cd902592fc1..b8a14d2fb5ba 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -14,6 +14,8 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -93,6 +95,20 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ * Also a default for cs->uncertainty_margin when registering clocks.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ */
+#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -119,10 +135,9 @@ static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
/*
- * Interval: 0.5sec Threshold: 0.0625s
+ * Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_watchdog_work(struct work_struct *work)
{
@@ -184,12 +199,164 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
+ulong max_cswd_read_retries = 3;
+module_param(max_cswd_read_retries, ulong, 0644);
+EXPORT_SYMBOL_GPL(max_cswd_read_retries);
+static int verify_n_cpus = 8;
+module_param(verify_n_cpus, int, 0644);
+
+static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+{
+ unsigned int nretries;
+ u64 wd_end, wd_delta;
+ int64_t wd_delay;
+
+ for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+ local_irq_disable();
+ *wdnow = watchdog->read(watchdog);
+ *csnow = cs->read(cs);
+ wd_end = watchdog->read(watchdog);
+ local_irq_enable();
+
+ wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
+ wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
+ watchdog->shift);
+ if (wd_delay <= WATCHDOG_MAX_SKEW) {
+ if (nretries > 1 || nretries >= max_cswd_read_retries) {
+ pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
+ smp_processor_id(), watchdog->name, nretries);
+ }
+ return true;
+ }
+ }
+
+ pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
+ smp_processor_id(), watchdog->name, wd_delay, nretries);
+ return false;
+}
+
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+static cpumask_t cpus_chosen;
+
+static void clocksource_verify_choose_cpus(void)
+{
+ int cpu, i, n = verify_n_cpus;
+
+ if (n < 0) {
+ /* Check all of the CPUs. */
+ cpumask_copy(&cpus_chosen, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ return;
+ }
+
+ /* If no checking desired, or no other CPU to check, leave. */
+ cpumask_clear(&cpus_chosen);
+ if (n == 0 || num_online_cpus() <= 1)
+ return;
+
+ /* Make sure to select at least one CPU other than the current CPU. */
+ cpu = cpumask_next(-1, cpu_online_mask);
+ if (cpu == smp_processor_id())
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+ cpumask_set_cpu(cpu, &cpus_chosen);
+
+ /* Force a sane value for the boot parameter. */
+ if (n > nr_cpu_ids)
+ n = nr_cpu_ids;
+
+ /*
+ * Randomly select the specified number of CPUs. If the same
+ * CPU is selected multiple times, that CPU is checked only once,
+ * and no replacement CPU is selected. This gracefully handles
+ * situations where verify_n_cpus is greater than the number of
+ * CPUs that are currently online.
+ */
+ for (i = 1; i < n; i++) {
+ cpu = prandom_u32() % nr_cpu_ids;
+ cpu = cpumask_next(cpu - 1, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(-1, cpu_online_mask);
+ if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ cpumask_set_cpu(cpu, &cpus_chosen);
+ }
+
+ /* Don't verify ourselves. */
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+}
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+ struct clocksource *cs = (struct clocksource *)csin;
+
+ csnow_mid = cs->read(cs);
+}
+
+void clocksource_verify_percpu(struct clocksource *cs)
+{
+ int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+ u64 csnow_begin, csnow_end;
+ int cpu, testcpu;
+ s64 delta;
+
+ if (verify_n_cpus == 0)
+ return;
+ cpumask_clear(&cpus_ahead);
+ cpumask_clear(&cpus_behind);
+ cpus_read_lock();
+ preempt_disable();
+ clocksource_verify_choose_cpus();
+ if (cpumask_weight(&cpus_chosen) == 0) {
+ preempt_enable();
+ cpus_read_unlock();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+ return;
+ }
+ testcpu = smp_processor_id();
+ pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+ csnow_begin = cs->read(cs);
+ smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+ csnow_end = cs->read(cs);
+ delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_behind);
+ delta = (csnow_end - csnow_mid) & cs->mask;
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_ahead);
+ delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ if (cs_nsec > cs_nsec_max)
+ cs_nsec_max = cs_nsec;
+ if (cs_nsec < cs_nsec_min)
+ cs_nsec_min = cs_nsec;
+ }
+ preempt_enable();
+ cpus_read_unlock();
+ if (!cpumask_empty(&cpus_ahead))
+ pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_behind))
+ pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
+ pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+
static void clocksource_watchdog(struct timer_list *unused)
{
- struct clocksource *cs;
u64 csnow, wdnow, cslast, wdlast, delta;
- int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
+ int64_t wd_nsec, cs_nsec;
+ struct clocksource *cs;
+ u32 md;
spin_lock(&watchdog_lock);
if (!watchdog_running)
@@ -206,10 +373,11 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- local_irq_disable();
- csnow = cs->read(cs);
- wdnow = watchdog->read(watchdog);
- local_irq_enable();
+ if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
+ continue;
+ }
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
@@ -235,13 +403,20 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
/* Check the deviation from the watchdog clocksource. */
- if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+ if (abs(cs_nsec - wd_nsec) > md) {
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, csnow, cslast, cs->mask);
+ pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, cs_nsec, csnow, cslast, cs->mask);
+ if (curr_clocksource == cs)
+ pr_warn(" '%s' is current clocksource.\n", cs->name);
+ else if (curr_clocksource)
+ pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
+ else
+ pr_warn(" No current clocksource.\n");
__clocksource_unstable(cs);
continue;
}
@@ -407,6 +582,12 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
+ /* Do any required per-CPU skew verification. */
+ if (curr_clocksource &&
+ curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+ curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+ clocksource_verify_percpu(curr_clocksource);
+
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -876,6 +1057,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
}
+
+ /*
+ * If the uncertainty margin is not specified, calculate it.
+ * If both scale and freq are non-zero, calculate the clock
+ * period, but bound below at 2*WATCHDOG_MAX_SKEW. However,
+ * if either of scale or freq is zero, be very conservative and
+ * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
+ * uncertainty margin. Allow stupidly small uncertainty margins
+ * to be specified by the caller for testing purposes, but warn
+ * to discourage production use of this capability.
+ */
+ if (scale && freq && !cs->uncertainty_margin) {
+ cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+ if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+ cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+ } else if (!cs->uncertainty_margin) {
+ cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ }
+ WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
/*
* Ensure clocksources that have large 'mult' values don't overflow
* when adjusted.
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4a66725b1d4a..0ea8702eb516 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -652,21 +652,10 @@ static inline int hrtimer_hres_active(void)
return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer *next_timer,
+ ktime_t expires_next)
{
- ktime_t expires_next;
-
- expires_next = hrtimer_update_next_event(cpu_base);
-
- if (skip_equal && expires_next == cpu_base->expires_next)
- return;
-
cpu_base->expires_next = expires_next;
/*
@@ -689,7 +678,25 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
- tick_program_event(cpu_base->expires_next, 1);
+ tick_program_event(expires_next, 1);
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+{
+ ktime_t expires_next;
+
+ expires_next = hrtimer_update_next_event(cpu_base);
+
+ if (skip_equal && expires_next == cpu_base->expires_next)
+ return;
+
+ __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
}
/* High resolution timer related functions */
@@ -720,23 +727,7 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
- struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
-
- if (!__hrtimer_hres_active(base))
- return;
-
- raw_spin_lock(&base->lock);
- hrtimer_update_base(base);
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
-}
+static void retrigger_next_event(void *arg);
/*
* Switch to high resolution mode
@@ -758,29 +749,54 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
}
-static void clock_was_set_work(struct work_struct *work)
-{
- clock_was_set();
-}
+#else
-static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
+#endif /* CONFIG_HIGH_RES_TIMERS */
/*
- * Called from timekeeping and resume code to reprogram the hrtimer
- * interrupt device on all cpus.
+ * Retrigger next event is called after clock was set with interrupts
+ * disabled through an SMP function call or directly from low level
+ * resume code.
+ *
+ * This is only invoked when:
+ * - CONFIG_HIGH_RES_TIMERS is enabled.
+ * - CONFIG_NOHZ_COMMON is enabled
+ *
+ * For the other cases this function is empty and because the call sites
+ * are optimized out it vanishes as well, i.e. no need for lots of
+ * #ifdeffery.
*/
-void clock_was_set_delayed(void)
+static void retrigger_next_event(void *arg)
{
- schedule_work(&hrtimer_work);
-}
-
-#else
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline void hrtimer_switch_to_hres(void) { }
-static inline void retrigger_next_event(void *arg) { }
+ /*
+ * When high resolution mode or nohz is active, then the offsets of
+ * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
+ * next tick will take care of that.
+ *
+ * If high resolution mode is active then the next expiring timer
+ * must be reevaluated and the clock event device reprogrammed if
+ * necessary.
+ *
+ * In the NOHZ case the update of the offset and the reevaluation
+ * of the next expiring timer is enough. The return from the SMP
+ * function call will take care of the reprogramming in case the
+ * CPU was in a NOHZ idle sleep.
+ */
+ if (!__hrtimer_hres_active(base) && !tick_nohz_active)
+ return;
-#endif /* CONFIG_HIGH_RES_TIMERS */
+ raw_spin_lock(&base->lock);
+ hrtimer_update_base(base);
+ if (__hrtimer_hres_active(base))
+ hrtimer_force_reprogram(base, 0);
+ else
+ hrtimer_update_next_event(base);
+ raw_spin_unlock(&base->lock);
+}
/*
* When a timer is enqueued and expires earlier than the already enqueued
@@ -835,75 +851,161 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
if (base->cpu_base != cpu_base)
return;
+ if (expires >= cpu_base->expires_next)
+ return;
+
/*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
+ * If the hrtimer interrupt is running, then it will reevaluate the
+ * clock bases and reprogram the clock event device.
*/
if (cpu_base->in_hrtirq)
return;
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
cpu_base->next_timer = timer;
- cpu_base->expires_next = expires;
+
+ __hrtimer_reprogram(cpu_base, timer, expires);
+}
+
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
+ unsigned int active)
+{
+ struct hrtimer_clock_base *base;
+ unsigned int seq;
+ ktime_t expires;
/*
- * If hres is not active, hardware does not have to be
- * programmed yet.
+ * Update the base offsets unconditionally so the following
+ * checks whether the SMP function call is required works.
*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
+ * The update is safe even when the remote CPU is in the hrtimer
+ * interrupt or the hrtimer soft interrupt and expiring affected
+ * bases. Either it will see the update before handling a base or
+ * it will see it when it finishes the processing and reevaluates
+ * the next expiring timer.
*/
- if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
- return;
+ seq = cpu_base->clock_was_set_seq;
+ hrtimer_update_base(cpu_base);
+
+ /*
+ * If the sequence did not change over the update then the
+ * remote CPU already handled it.
+ */
+ if (seq == cpu_base->clock_was_set_seq)
+ return false;
+
+ /*
+ * If the remote CPU is currently handling an hrtimer interrupt, it
+ * will reevaluate the first expiring timer of all clock bases
+ * before reprogramming. Nothing to do here.
+ */
+ if (cpu_base->in_hrtirq)
+ return false;
/*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
+ * Walk the affected clock bases and check whether the first expiring
+ * timer in a clock base is moving ahead of the first expiring timer of
+ * @cpu_base. If so, the IPI must be invoked because per CPU clock
+ * event devices cannot be remotely reprogrammed.
*/
- tick_program_event(expires, 1);
+ active &= cpu_base->active_bases;
+
+ for_each_active_base(base, cpu_base, active) {
+ struct timerqueue_node *next;
+
+ next = timerqueue_getnext(&base->active);
+ expires = ktime_sub(next->expires, base->offset);
+ if (expires < cpu_base->expires_next)
+ return true;
+
+ /* Extra check for softirq clock bases */
+ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
+ continue;
+ if (cpu_base->softirq_activated)
+ continue;
+ if (expires < cpu_base->softirq_expires_next)
+ return true;
+ }
+ return false;
}
/*
- * Clock realtime was set
- *
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
+ * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
+ * CLOCK_BOOTTIME (for late sleep time injection).
*
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
+ * This requires to update the offsets for these clocks
+ * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
+ * also requires to eventually reprogram the per CPU clock event devices
+ * when the change moves an affected timer ahead of the first expiring
+ * timer on that CPU. Obviously remote per CPU clock event devices cannot
+ * be reprogrammed. The other reason why an IPI has to be sent is when the
+ * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
+ * in the tick, which obviously might be stopped, so this has to bring out
+ * the remote CPU which might sleep in idle to get this sorted.
*/
-void clock_was_set(void)
+void clock_was_set(unsigned int bases)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
-#endif
+ struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
+ cpumask_var_t mask;
+ int cpu;
+
+ if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ goto out_timerfd;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto out_timerfd;
+ }
+
+ /* Avoid interrupting CPUs if possible */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ unsigned long flags;
+
+ cpu_base = &per_cpu(hrtimer_bases, cpu);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (update_needs_ipi(cpu_base, bases))
+ cpumask_set_cpu(cpu, mask);
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ }
+
+ preempt_disable();
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+ preempt_enable();
+ cpus_read_unlock();
+ free_cpumask_var(mask);
+
+out_timerfd:
timerfd_clock_was_set();
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set(CLOCK_SET_WALL);
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping code to reprogram the hrtimer interrupt device
+ * on all cpus and to notify timerfd.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
/*
- * During resume we might have to reprogram the high resolution timer
- * interrupt on all online CPUs. However, all other CPUs will be
- * stopped with IRQs interrupts disabled so the clock_was_set() call
- * must be deferred.
+ * Called during resume either directly from via timekeeping_resume()
+ * or in the case of s2idle from tick_unfreeze() to ensure that the
+ * hrtimers are up to date.
*/
-void hrtimers_resume(void)
+void hrtimers_resume_local(void)
{
lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
- /* And schedule a retrigger for all others */
- clock_was_set_delayed();
}
/*
@@ -1030,12 +1132,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool restart, bool keep_local)
{
u8 state = timer->state;
if (state & HRTIMER_STATE_ENQUEUED) {
- int reprogram;
+ bool reprogram;
/*
* Remove the timer and force reprogramming when high
@@ -1048,8 +1151,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ /*
+ * If the timer is not restarted then reprogramming is
+ * required if the timer is local. If it is local and about
+ * to be restarted, avoid programming it twice (on removal
+ * and a moment later when it's requeued).
+ */
if (!restart)
state = HRTIMER_STATE_INACTIVE;
+ else
+ reprogram &= !keep_local;
__remove_hrtimer(timer, base, state, reprogram);
return 1;
@@ -1103,9 +1214,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
struct hrtimer_clock_base *base)
{
struct hrtimer_clock_base *new_base;
+ bool force_local, first;
- /* Remove an active timer from the queue: */
- remove_hrtimer(timer, base, true);
+ /*
+ * If the timer is on the local cpu base and is the first expiring
+ * timer then this might end up reprogramming the hardware twice
+ * (on removal and on enqueue). To avoid that by prevent the
+ * reprogram on removal, keep the timer local to the current CPU
+ * and enforce reprogramming after it is queued no matter whether
+ * it is the new first expiring timer again or not.
+ */
+ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local &= base->cpu_base->next_timer == timer;
+
+ /*
+ * Remove an active timer from the queue. In case it is not queued
+ * on the current CPU, make sure that remove_hrtimer() updates the
+ * remote data correctly.
+ *
+ * If it's on the current CPU and the first expiring timer, then
+ * skip reprogramming, keep the timer local and enforce
+ * reprogramming later if it was the first expiring timer. This
+ * avoids programming the underlying clock event twice (once at
+ * removal and once after enqueue).
+ */
+ remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
@@ -1115,9 +1248,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
/* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+ if (!force_local) {
+ new_base = switch_hrtimer_base(timer, base,
+ mode & HRTIMER_MODE_PINNED);
+ } else {
+ new_base = base;
+ }
- return enqueue_hrtimer(timer, new_base, mode);
+ first = enqueue_hrtimer(timer, new_base, mode);
+ if (!force_local)
+ return first;
+
+ /*
+ * Timer was forced to stay on the current CPU to avoid
+ * reprogramming on removal and enqueue. Force reprogram the
+ * hardware by evaluating the new first expiring timer.
+ */
+ hrtimer_force_reprogram(new_base->cpu_base, 1);
+ return 0;
}
/**
@@ -1183,7 +1331,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false);
+ ret = remove_hrtimer(timer, base, false, false);
unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a492e4da69ba..bc4db9e5ab70 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -10,28 +10,9 @@
#include <linux/init.h>
#include "timekeeping.h"
+#include "tick-internal.h"
-/* Since jiffies uses a simple TICK_NSEC multiplier
- * conversion, the .shift value could be zero. However
- * this would make NTP adjustments impossible as they are
- * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
- * shift both the nominator and denominator the same
- * amount, and give ntp adjustments in units of 1/2^8
- *
- * The value 8 is somewhat carefully chosen, as anything
- * larger can result in overflows. TICK_NSEC grows as HZ
- * shrinks, so values greater than 8 overflow 32bits when
- * HZ=100.
- */
-#if HZ < 34
-#define JIFFIES_SHIFT 6
-#elif HZ < 67
-#define JIFFIES_SHIFT 7
-#else
-#define JIFFIES_SHIFT 8
-#endif
-
static u64 jiffies_read(struct clocksource *cs)
{
return (u64) jiffies;
@@ -49,13 +30,14 @@ static u64 jiffies_read(struct clocksource *cs)
* for "tick-less" systems.
*/
static struct clocksource clocksource_jiffies = {
- .name = "jiffies",
- .rating = 1, /* lowest valid rating*/
- .read = jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
+ .read = jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 12eab0d2ae28..aec832801c26 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
refcount_set(&ns->ns.count, 1);
- ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3bb96a8b49c9..643d412ac623 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
+ lockdep_assert_task_sighand_held(tsk);
+
/* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(pct->timers_active)) {
struct task_cputime sum;
@@ -405,6 +407,55 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return 0;
}
+static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
+ struct task_struct *tsk)
+{
+ int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ return tsk->posix_cputimers.bases + clkidx;
+ else
+ return tsk->signal->posix_cputimers.bases + clkidx;
+}
+
+/*
+ * Force recalculating the base earliest expiration on the next tick.
+ * This will also re-evaluate the need to keep around the process wide
+ * cputime counter and tick dependency and eventually shut these down
+ * if necessary.
+ */
+static void trigger_base_recalc_expires(struct k_itimer *timer,
+ struct task_struct *tsk)
+{
+ struct posix_cputimer_base *base = timer_base(timer, tsk);
+
+ base->nextevt = 0;
+}
+
+/*
+ * Dequeue the timer and reset the base if it was its earliest expiration.
+ * It makes sure the next tick recalculates the base next expiration so we
+ * don't keep the costly process wide cputime counter around for a random
+ * amount of time, along with the tick dependency.
+ *
+ * If another timer gets queued between this and the next tick, its
+ * expiration will update the base next event if necessary on the next
+ * tick.
+ */
+static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
+{
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct posix_cputimer_base *base;
+
+ if (!cpu_timer_dequeue(ctmr))
+ return;
+
+ base = timer_base(timer, p);
+ if (cpu_timer_getexpires(ctmr) == base->nextevt)
+ trigger_base_recalc_expires(timer, p);
+}
+
+
/*
* Clean up a CPU-clock timer that is about to be destroyed.
* This is called from timer deletion with the timer already locked.
@@ -439,7 +490,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
if (timer->it.cpu.firing)
ret = TIMER_RETRY;
else
- cpu_timer_dequeue(ctmr);
+ disarm_timer(timer, p);
unlock_task_sighand(p, &flags);
}
@@ -498,15 +549,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
*/
static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
- int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+ struct posix_cputimer_base *base = timer_base(timer, p);
struct cpu_timer *ctmr = &timer->it.cpu;
u64 newexp = cpu_timer_getexpires(ctmr);
- struct posix_cputimer_base *base;
-
- if (CPUCLOCK_PERTHREAD(timer->it_clock))
- base = p->posix_cputimers.bases + clkidx;
- else
- base = p->signal->posix_cputimers.bases + clkidx;
if (!cpu_timer_enqueue(&base->tqhead, ctmr))
return;
@@ -523,7 +568,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
- tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+ tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
}
/*
@@ -703,16 +748,29 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
timer->it_overrun_last = 0;
timer->it_overrun = -1;
- if (new_expires != 0 && !(val < new_expires)) {
+ if (val >= new_expires) {
+ if (new_expires != 0) {
+ /*
+ * The designated time already passed, so we notify
+ * immediately, even if the thread never runs to
+ * accumulate more time on this clock.
+ */
+ cpu_timer_fire(timer);
+ }
+
/*
- * The designated time already passed, so we notify
- * immediately, even if the thread never runs to
- * accumulate more time on this clock.
+ * Make sure we don't keep around the process wide cputime
+ * counter or the tick dependency if they are not necessary.
*/
- cpu_timer_fire(timer);
- }
+ sighand = lock_task_sighand(p, &flags);
+ if (!sighand)
+ goto out;
+
+ if (!cpu_timer_queued(ctmr))
+ trigger_base_recalc_expires(timer, p);
- ret = 0;
+ unlock_task_sighand(p, &flags);
+ }
out:
rcu_read_unlock();
if (old)
@@ -991,6 +1049,11 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
if (!p)
goto out;
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL))
+ goto out;
+
/*
* Fetch the current sample and update the timer's expiry time.
*/
@@ -1001,11 +1064,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
bump_cpu_timer(timer, now);
- /* Protect timer list r/w in arm_timer() */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL))
- goto out;
-
/*
* Now re-arm for the new expiry time.
*/
@@ -1346,9 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
}
}
- if (!*newval)
- return;
- *newval += now;
+ if (*newval)
+ *newval += now;
}
/*
@@ -1358,7 +1415,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
if (*newval < *nextevt)
*nextevt = *newval;
- tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+ tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index dd5697d7347b..1cd10b102c51 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof (struct k_itimer), 0, SLAB_PANIC,
- NULL);
+ sizeof(struct k_itimer), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info)
int posix_timer_event(struct k_itimer *timr, int si_private)
{
enum pid_type type;
- int ret = -1;
+ int ret;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->posixtimer_rearm().
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index a44055228796..f7fe6fe36173 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,6 +33,8 @@ static int tick_broadcast_forced;
static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
#ifdef CONFIG_TICK_ONESHOT
+static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);
+
static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -61,6 +63,13 @@ struct cpumask *tick_get_broadcast_mask(void)
return tick_broadcast_mask;
}
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu);
+
+const struct clock_event_device *tick_get_wakeup_device(int cpu)
+{
+ return tick_get_oneshot_wakeup_device(cpu);
+}
+
/*
* Start the device in periodic mode
*/
@@ -88,13 +97,75 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
return !curdev || newdev->rating > curdev->rating;
}
+#ifdef CONFIG_TICK_ONESHOT
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
+{
+ return per_cpu(tick_oneshot_wakeup_device, cpu);
+}
+
+static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
+{
+ /*
+ * If we woke up early and the tick was reprogrammed in the
+ * meantime then this may be spurious but harmless.
+ */
+ tick_receive_broadcast();
+}
+
+static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
+ int cpu)
+{
+ struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu);
+
+ if (!newdev)
+ goto set_device;
+
+ if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return false;
+
+ if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+
+ if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
+ return false;
+
+ if (curdev && newdev->rating <= curdev->rating)
+ return false;
+
+ if (!try_module_get(newdev->owner))
+ return false;
+
+ newdev->event_handler = tick_oneshot_wakeup_handler;
+set_device:
+ clockevents_exchange_device(curdev, newdev);
+ per_cpu(tick_oneshot_wakeup_device, cpu) = newdev;
+ return true;
+}
+#else
+static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
+{
+ return NULL;
+}
+
+static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
+ int cpu)
+{
+ return false;
+}
+#endif
+
/*
* Conditionally install/replace broadcast device
*/
-void tick_install_broadcast_device(struct clock_event_device *dev)
+void tick_install_broadcast_device(struct clock_event_device *dev, int cpu)
{
struct clock_event_device *cur = tick_broadcast_device.evtdev;
+ if (tick_set_oneshot_wakeup_device(dev, cpu))
+ return;
+
if (!tick_check_broadcast_device(cur, dev))
return;
@@ -253,7 +324,6 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
return ret;
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
int tick_receive_broadcast(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
@@ -268,7 +338,6 @@ int tick_receive_broadcast(void)
evt->event_handler(evt);
return 0;
}
-#endif
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
@@ -719,24 +788,16 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state,
+ struct tick_device *td,
+ int cpu)
{
- struct clock_event_device *bc, *dev;
- int cpu, ret = 0;
+ struct clock_event_device *bc, *dev = td->evtdev;
+ int ret = 0;
ktime_t now;
- /*
- * If there is no broadcast device, tell the caller not to go
- * into deep idle.
- */
- if (!tick_broadcast_device.evtdev)
- return -EBUSY;
-
- dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
-
raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
- cpu = smp_processor_id();
if (state == TICK_BROADCAST_ENTER) {
/*
@@ -865,6 +926,53 @@ out:
return ret;
}
+static int tick_oneshot_wakeup_control(enum tick_broadcast_state state,
+ struct tick_device *td,
+ int cpu)
+{
+ struct clock_event_device *dev, *wd;
+
+ dev = td->evtdev;
+ if (td->mode != TICKDEV_MODE_ONESHOT)
+ return -EINVAL;
+
+ wd = tick_get_oneshot_wakeup_device(cpu);
+ if (!wd)
+ return -ENODEV;
+
+ switch (state) {
+ case TICK_BROADCAST_ENTER:
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_program_event(wd, dev->next_event, 1);
+ break;
+ case TICK_BROADCAST_EXIT:
+ /* We may have transitioned to oneshot mode while idle */
+ if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT)
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ int cpu = smp_processor_id();
+
+ if (!tick_oneshot_wakeup_control(state, td, cpu))
+ return 0;
+
+ if (tick_broadcast_device.evtdev)
+ return ___tick_broadcast_oneshot_control(state, td, cpu);
+
+ /*
+ * If there is no broadcast or wakeup device, tell the caller not
+ * to go into deep idle.
+ */
+ return -EBUSY;
+}
+
/*
* Reset the one shot broadcast for a cpu
*
@@ -991,6 +1099,9 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
*/
static void tick_broadcast_oneshot_offline(unsigned int cpu)
{
+ if (tick_get_oneshot_wakeup_device(cpu))
+ tick_set_oneshot_wakeup_device(NULL, cpu);
+
/*
* Clear the broadcast masks for the dead cpu, but do not stop
* the broadcast device!
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index e15bc0ef1912..46789356f856 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -373,7 +373,7 @@ out_bc:
/*
* Can the new device be used as a broadcast device ?
*/
- tick_install_broadcast_device(newdev);
+ tick_install_broadcast_device(newdev, cpu);
}
/**
@@ -470,6 +470,13 @@ void tick_resume_local(void)
else
tick_resume_oneshot();
}
+
+ /*
+ * Ensure that hrtimers are up to date and the clockevents device
+ * is reprogrammed correctly when high resolution timers are
+ * enabled.
+ */
+ hrtimers_resume_local();
}
/**
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7a981c9e87a4..649f2b48e8f0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -61,7 +61,7 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern void tick_install_broadcast_device(struct clock_event_device *dev, int cpu);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
@@ -71,8 +71,9 @@ extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadc
extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
extern struct tick_device *tick_get_broadcast_device(void);
extern struct cpumask *tick_get_broadcast_mask(void);
+extern const struct clock_event_device *tick_get_wakeup_device(int cpu);
# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
-static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { }
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
@@ -164,3 +165,35 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
void timer_clear_idle(void);
+
+#define CLOCK_SET_WALL \
+ (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \
+ BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT))
+
+#define CLOCK_SET_BOOT \
+ (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT))
+
+void clock_was_set(unsigned int bases);
+void clock_was_set_delayed(void);
+
+void hrtimers_resume_local(void);
+
+/* Since jiffies uses a simple TICK_NSEC multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the nominator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. TICK_NSEC grows as HZ
+ * shrinks, so values greater than 8 overflow 32bits when
+ * HZ=100.
+ */
+#if HZ < 34
+#define JIFFIES_SHIFT 6
+#elif HZ < 67
+#define JIFFIES_SHIFT 7
+#else
+#define JIFFIES_SHIFT 8
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6784f27a3099..6bffe5af8cb1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -323,6 +323,46 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+ int cpu;
+
+ /*
+ * If the task is not running, run_posix_cpu_timers()
+ * has nothing to elapse, IPI can then be spared.
+ *
+ * activate_task() STORE p->tick_dep_mask
+ * STORE p->on_rq
+ * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or())
+ * LOCK rq->lock LOAD p->on_rq
+ * smp_mb__after_spin_lock()
+ * tick_nohz_task_switch()
+ * LOAD p->tick_dep_mask
+ */
+ if (!sched_task_on_rq(tsk))
+ return;
+
+ /*
+ * If the task concurrently migrates to another CPU,
+ * we guarantee it sees the new tick dependency upon
+ * schedule.
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() STORE p->tick_dep_mask
+ * tick_nohz_task_switch() smp_mb() (atomic_fetch_or())
+ * LOAD p->tick_dep_mask LOAD p->cpu
+ */
+ cpu = task_cpu(tsk);
+
+ preempt_disable();
+ if (cpu_online(cpu))
+ tick_nohz_full_kick_cpu(cpu);
+ preempt_enable();
+}
+
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
@@ -405,19 +445,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
- if (tsk == current) {
- preempt_disable();
- tick_nohz_full_kick();
- preempt_enable();
- } else {
- /*
- * Some future tick_nohz_full_kick_task()
- * should optimize this.
- */
- tick_nohz_full_kick_all();
- }
- }
+ if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+ tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
@@ -431,9 +460,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
* per process timers.
*/
-void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+void tick_nohz_dep_set_signal(struct task_struct *tsk,
+ enum tick_dep_bits bit)
{
- tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+ int prev;
+ struct signal_struct *sig = tsk->signal;
+
+ prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+ if (!prev) {
+ struct task_struct *t;
+
+ lockdep_assert_held(&tsk->sighand->siglock);
+ __for_each_thread(sig, t)
+ tick_nohz_kick_task(t);
+ }
}
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
@@ -448,13 +488,10 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
*/
void __tick_nohz_task_switch(void)
{
- unsigned long flags;
struct tick_sched *ts;
- local_irq_save(flags);
-
if (!tick_nohz_full_cpu(smp_processor_id()))
- goto out;
+ return;
ts = this_cpu_ptr(&tick_cpu_sched);
@@ -463,8 +500,6 @@ void __tick_nohz_task_switch(void)
atomic_read(&current->signal->tick_dep_mask))
tick_nohz_full_kick();
}
-out:
- local_irq_restore(flags);
}
/* Get the boot-time nohz CPU list from the kernel parameters. */
@@ -922,27 +957,31 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
* Cancel the scheduled timer and restore the tick
*/
ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
tick_nohz_restart(ts, now);
}
-static void tick_nohz_full_update_tick(struct tick_sched *ts)
+static void __tick_nohz_full_update_tick(struct tick_sched *ts,
+ ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu))
+ if (can_stop_full_tick(cpu, ts))
+ tick_nohz_stop_sched_tick(ts, cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, now);
+#endif
+}
+
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
+{
+ if (!tick_nohz_full_cpu(smp_processor_id()))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, cpu);
- else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get());
-#endif
+ __tick_nohz_full_update_tick(ts, ktime_get());
}
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
@@ -1189,11 +1228,13 @@ unsigned long tick_nohz_get_idle_calls(void)
return ts->idle_calls;
}
-static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+static void tick_nohz_account_idle_time(struct tick_sched *ts,
+ ktime_t now)
{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
+ ts->idle_exittime = now;
+
if (vtime_accounting_enabled_this_cpu())
return;
/*
@@ -1207,21 +1248,27 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
*/
if (ticks && ticks < LONG_MAX)
account_idle_ticks(ticks);
-#endif
}
-static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+void tick_nohz_idle_restart_tick(void)
{
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->tick_stopped) {
+ ktime_t now = ktime_get();
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_time(ts, now);
+ }
}
-void tick_nohz_idle_restart_tick(void)
+static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ __tick_nohz_full_update_tick(ts, now);
+ else
+ tick_nohz_restart_sched_tick(ts, now);
- if (ts->tick_stopped)
- __tick_nohz_idle_restart_tick(ts, ktime_get());
+ tick_nohz_account_idle_time(ts, now);
}
/**
@@ -1253,7 +1300,7 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (tick_stopped)
- __tick_nohz_idle_restart_tick(ts, now);
+ tick_nohz_idle_update_tick(ts, now);
local_irq_enable();
}
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
new file mode 100644
index 000000000000..831e8e779ace
--- /dev/null
+++ b/kernel/time/time_test.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <kunit/test.h>
+#include <linux/time.h>
+
+/*
+ * Traditional implementation of leap year evaluation.
+ */
+static bool is_leap(long year)
+{
+ return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
+}
+
+/*
+ * Gets the last day of a month.
+ */
+static int last_day_of_month(long year, int month)
+{
+ if (month == 2)
+ return 28 + is_leap(year);
+ if (month == 4 || month == 6 || month == 9 || month == 11)
+ return 30;
+ return 31;
+}
+
+/*
+ * Advances a date by one day.
+ */
+static void advance_date(long *year, int *month, int *mday, int *yday)
+{
+ if (*mday != last_day_of_month(*year, *month)) {
+ ++*mday;
+ ++*yday;
+ return;
+ }
+
+ *mday = 1;
+ if (*month != 12) {
+ ++*month;
+ ++*yday;
+ return;
+ }
+
+ *month = 1;
+ *yday = 0;
+ ++*year;
+}
+
+/*
+ * Checks every day in a 160000 years interval centered at 1970-01-01
+ * against the expected result.
+ */
+static void time64_to_tm_test_date_range(struct kunit *test)
+{
+ /*
+ * 80000 years = (80000 / 400) * 400 years
+ * = (80000 / 400) * 146097 days
+ * = (80000 / 400) * 146097 * 86400 seconds
+ */
+ time64_t total_secs = ((time64_t) 80000) / 400 * 146097 * 86400;
+ long year = 1970 - 80000;
+ int month = 1;
+ int mdday = 1;
+ int yday = 0;
+
+ struct tm result;
+ time64_t secs;
+ s64 days;
+
+ for (secs = -total_secs; secs <= total_secs; secs += 86400) {
+
+ time64_to_tm(secs, 0, &result);
+
+ days = div_s64(secs, 86400);
+
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ year, month, mdday, yday, days
+
+ KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, month - 1, result.tm_mon, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, mdday, result.tm_mday, FAIL_MSG);
+ KUNIT_ASSERT_EQ_MSG(test, yday, result.tm_yday, FAIL_MSG);
+
+ advance_date(&year, &month, &mdday, &yday);
+ }
+}
+
+static struct kunit_case time_test_cases[] = {
+ KUNIT_CASE(time64_to_tm_test_date_range),
+ {}
+};
+
+static struct kunit_suite time_test_suite = {
+ .name = "time_test_cases",
+ .test_cases = time_test_cases,
+};
+
+kunit_test_suite(time_test_suite);
+MODULE_LICENSE("GPL");
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 62e3b46717a6..59b922c826e7 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -22,47 +22,16 @@
/*
* Converts the calendar time to broken-down time representation
- * Based on code from glibc-2.6
*
* 2009-7-14:
* Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ * 2021-06-02:
+ * Reimplemented by Cassio Neri <cassio.neri@gmail.com>
*/
#include <linux/time.h>
#include <linux/module.h>
-
-/*
- * Nonzero if YEAR is a leap year (every 4 years,
- * except every 100th isn't, and every 400th is).
- */
-static int __isleap(long year)
-{
- return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
-}
-
-/* do a mathdiv for long type */
-static long math_div(long a, long b)
-{
- return a / b - (a % b < 0);
-}
-
-/* How many leap years between y1 and y2, y1 must less or equal to y2 */
-static long leaps_between(long y1, long y2)
-{
- long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
- + math_div(y1 - 1, 400);
- long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
- + math_div(y2 - 1, 400);
- return leaps2 - leaps1;
-}
-
-/* How many days come before each month (0-12). */
-static const unsigned short __mon_yday[2][13] = {
- /* Normal years. */
- {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
- /* Leap years. */
- {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
-};
+#include <linux/kernel.h>
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
@@ -77,9 +46,11 @@ static const unsigned short __mon_yday[2][13] = {
*/
void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
- long days, rem, y;
+ u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day;
+ u64 u64tmp, udays, century, year;
+ bool is_Jan_or_Feb, is_leap_year;
+ long days, rem;
int remainder;
- const unsigned short *ip;
days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
rem = remainder;
@@ -103,27 +74,68 @@ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
if (result->tm_wday < 0)
result->tm_wday += 7;
- y = 1970;
+ /*
+ * The following algorithm is, basically, Proposition 6.3 of Neri
+ * and Schneider [1]. In a few words: it works on the computational
+ * (fictitious) calendar where the year starts in March, month = 2
+ * (*), and finishes in February, month = 13. This calendar is
+ * mathematically convenient because the day of the year does not
+ * depend on whether the year is leap or not. For instance:
+ *
+ * March 1st 0-th day of the year;
+ * ...
+ * April 1st 31-st day of the year;
+ * ...
+ * January 1st 306-th day of the year; (Important!)
+ * ...
+ * February 28th 364-th day of the year;
+ * February 29th 365-th day of the year (if it exists).
+ *
+ * After having worked out the date in the computational calendar
+ * (using just arithmetics) it's easy to convert it to the
+ * corresponding date in the Gregorian calendar.
+ *
+ * [1] "Euclidean Affine Functions and Applications to Calendar
+ * Algorithms". https://arxiv.org/abs/2102.06959
+ *
+ * (*) The numbering of months follows tm more closely and thus,
+ * is slightly different from [1].
+ */
- while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
- /* Guess a corrected year, assuming 365 days per year. */
- long yg = y + math_div(days, 365);
+ udays = ((u64) days) + 2305843009213814918ULL;
- /* Adjust DAYS and Y to match the guessed year. */
- days -= (yg - y) * 365 + leaps_between(y, yg);
- y = yg;
- }
+ u64tmp = 4 * udays + 3;
+ century = div64_u64_rem(u64tmp, 146097, &u64tmp);
+ day_of_century = (u32) (u64tmp / 4);
+
+ u32tmp = 4 * day_of_century + 3;
+ u64tmp = 2939745ULL * u32tmp;
+ year_of_century = upper_32_bits(u64tmp);
+ day_of_year = lower_32_bits(u64tmp) / 2939745 / 4;
+
+ year = 100 * century + year_of_century;
+ is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4);
- result->tm_year = y - 1900;
+ u32tmp = 2141 * day_of_year + 132377;
+ month = u32tmp >> 16;
+ day = ((u16) u32tmp) / 2141;
- result->tm_yday = days;
+ /*
+ * Recall that January 1st is the 306-th day of the year in the
+ * computational (not Gregorian) calendar.
+ */
+ is_Jan_or_Feb = day_of_year >= 306;
- ip = __mon_yday[__isleap(y)];
- for (y = 11; days < ip[y]; y--)
- continue;
- days -= ip[y];
+ /* Convert to the Gregorian calendar and adjust to Unix time. */
+ year = year + is_Jan_or_Feb - 6313183731940000ULL;
+ month = is_Jan_or_Feb ? month - 12 : month;
+ day = day + 1;
+ day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year;
- result->tm_mon = y;
- result->tm_mday = days + 1;
+ /* Convert to tm's format. */
+ result->tm_year = (long) (year - 1900);
+ result->tm_mon = (int) month;
+ result->tm_mday = (int) day;
+ result->tm_yday = (int) day_of_year;
}
EXPORT_SYMBOL(time64_to_tm);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8a364aa9881a..b348749a9fc6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1323,8 +1323,8 @@ out:
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL);
if (!ret)
audit_tk_injoffset(ts_delta);
@@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL);
return ret;
}
@@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- /* signal hrtimers about time change */
- clock_was_set();
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif
@@ -1810,8 +1810,10 @@ void timekeeping_resume(void)
touch_softlockup_watchdog();
+ /* Resume the clockevent device(s) and hrtimers */
tick_resume();
- hrtimers_resume();
+ /* Notify timerfd as resume is equivalent to clock_was_set() */
+ timerfd_resume();
}
int timekeeping_suspend(void)
@@ -2125,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static void timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
@@ -2196,9 +2198,8 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- if (clock_set)
- /* Have to call _delayed version, since in irq context*/
- clock_was_set_delayed();
+
+ return !!clock_set;
}
/**
@@ -2207,7 +2208,8 @@ out:
*/
void update_wall_time(void)
{
- timekeeping_advance(TK_ADV_TICK);
+ if (timekeeping_advance(TK_ADV_TICK))
+ clock_was_set_delayed();
}
/**
@@ -2387,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct audit_ntp_data ad;
- unsigned long flags;
+ bool clock_set = false;
struct timespec64 ts;
+ unsigned long flags;
s32 orig_tai, tai;
int ret;
@@ -2423,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc)
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ clock_set = true;
}
tk_update_leap_state(tk);
@@ -2433,10 +2437,10 @@ int do_adjtimex(struct __kernel_timex *txc)
/* Update the multiplier immediately if frequency was set directly */
if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- timekeeping_advance(TK_ADV_FREQ);
+ clock_set |= timekeeping_advance(TK_ADV_FREQ);
- if (tai != orig_tai)
- clock_was_set();
+ if (clock_set)
+ clock_was_set(CLOCK_REALTIME);
ntp_notify_cmos_timer();
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d111adf4a0cb..e3d2c23c413d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -207,6 +207,7 @@ struct timer_base {
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -595,6 +596,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
@@ -1237,20 +1239,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
-bool timer_curr_running(struct timer_list *timer)
-{
- int i;
-
- for (i = 0; i < NR_BASES; i++) {
- struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
-
- if (base->running_timer == timer)
- return true;
- }
-
- return false;
-}
-
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
@@ -1277,8 +1265,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1469,14 +1459,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
@@ -1596,6 +1586,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
}
base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
return next;
}
@@ -1647,7 +1638,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
- bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1660,7 +1650,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
if (base->next_expiry_recalc)
base->next_expiry = __next_timer_interrupt(base);
nextevt = base->next_expiry;
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
/*
* We have a fresh next event. Check whether we can forward the
@@ -1678,7 +1667,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- if (!is_max_delta)
+ if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
@@ -1879,7 +1868,7 @@ signed long __sched schedule_timeout(signed long timeout)
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
goto out;
}
}
@@ -1961,6 +1950,7 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->timers_pending = false;
base->is_idle = false;
}
return 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 6939140ab7c5..ed7d6ad694fb 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,14 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
SEQ_printf(m, " event_handler: %ps\n", dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ if (cpu >= 0) {
+ const struct clock_event_device *wd = tick_get_wakeup_device(cpu);
+
+ SEQ_printf(m, "Wakeup Device: %s\n", wd ? wd->name : "<NULL>");
+ }
+#endif
SEQ_printf(m, "\n");
}
@@ -248,7 +256,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.8\n");
+ SEQ_printf(m, "Timer List Version: v0.9\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/torture.c b/kernel/torture.c
index 0a315c387bed..bb8f411c974b 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -521,11 +521,11 @@ static void torture_shuffle_tasks(void)
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
+ cpus_read_lock();
/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
- put_online_cpus();
+ cpus_read_unlock();
return;
}
@@ -541,7 +541,7 @@ static void torture_shuffle_tasks(void)
set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
mutex_unlock(&shuffle_task_mutex);
- put_online_cpus();
+ cpus_read_unlock();
}
/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7fa82778c3e6..420ff4bc67fd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -135,10 +135,9 @@ config TRACING_SUPPORT
depends on STACKTRACE_SUPPORT
default y
-if TRACING_SUPPORT
-
menuconfig FTRACE
bool "Tracers"
+ depends on TRACING_SUPPORT
default y if DEBUG_KERNEL
help
Enable the kernel tracing infrastructure.
@@ -219,6 +218,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -356,6 +360,68 @@ config HWLAT_TRACER
file. Every time a latency is greater than tracing_thresh, it will
be recorded into the ring buffer.
+config OSNOISE_TRACER
+ bool "OS Noise tracer"
+ select GENERIC_TRACER
+ help
+ In the context of high-performance computing (HPC), the Operating
+ System Noise (osnoise) refers to the interference experienced by an
+ application due to activities inside the operating system. In the
+ context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread
+ can cause noise to the system. Moreover, hardware-related jobs can
+ also cause noise, for example, via SMIs.
+
+ The osnoise tracer leverages the hwlat_detector by running a similar
+ loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
+ the sources of osnoise during its execution. The osnoise tracer takes
+ note of the entry and exit point of any source of interferences,
+ increasing a per-cpu interference counter. It saves an interference
+ counter for each source of interference. The interference counter for
+ NMI, IRQs, SoftIRQs, and threads is increased anytime the tool
+ observes these interferences' entry events. When a noise happens
+ without any interference from the operating system level, the
+ hardware noise counter increases, pointing to a hardware-related
+ noise. In this way, osnoise can account for any source of
+ interference. At the end of the period, the osnoise tracer prints
+ the sum of all noise, the max single noise, the percentage of CPU
+ available for the thread, and the counters for the noise sources.
+
+ In addition to the tracer, a set of tracepoints were added to
+ facilitate the identification of the osnoise source.
+
+ The output will appear in the trace and trace_pipe files.
+
+ To enable this tracer, echo in "osnoise" into the current_tracer
+ file.
+
+config TIMERLAT_TRACER
+ bool "Timerlat tracer"
+ select OSNOISE_TRACER
+ select GENERIC_TRACER
+ help
+ The timerlat tracer aims to help the preemptive kernel developers
+ to find sources of wakeup latencies of real-time threads.
+
+ The tracer creates a per-cpu kernel thread with real-time priority.
+ The tracer thread sets a periodic timer to wakeup itself, and goes
+ to sleep waiting for the timer to fire. At the wakeup, the thread
+ then computes a wakeup latency value as the difference between
+ the current time and the absolute time that the timer was set
+ to expire.
+
+ The tracer prints two lines at every activation. The first is the
+ timer latency observed at the hardirq context before the
+ activation of the thread. The second is the timer latency observed
+ by the thread, which is the same level that cyclictest reports. The
+ ACTIVATION ID field serves to relate the irq execution to its
+ respective thread execution.
+
+ The tracer is build on top of osnoise tracer, and the osnoise:
+ events can be used to trace the source of interference from NMI,
+ IRQs and other threads. It also enables the capture of the
+ stacktrace at the IRQ context, which helps to identify the code
+ path that can cause thread delay.
+
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && PCI
@@ -970,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG
If unsure, say N.
endif # FTRACE
-
-endif # TRACING_SUPPORT
-
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b28d3e5013cd..6de5d4d63165 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
+obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -76,6 +77,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c221e4c3f625..fa91f398f28b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ }
+
put_probe_ref();
synchronize_rcu();
blk_trace_free(bt);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a52bc172841..8e2eb950aa82 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
- ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
out:
__this_cpu_dec(bpf_prog_active);
@@ -714,13 +714,28 @@ BPF_CALL_0(bpf_get_current_task_btf)
return (unsigned long) current;
}
-BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct)
-
-static const struct bpf_func_proto bpf_get_current_task_btf_proto = {
+const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
.ret_type = RET_PTR_TO_BTF_ID,
- .ret_btf_id = &bpf_get_current_btf_ids[0],
+ .ret_btf_id = &btf_task_struct_ids[0],
+};
+
+BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
+{
+ return (unsigned long) task_pt_regs(task);
+}
+
+BTF_ID_LIST(bpf_task_pt_regs_ids)
+BTF_ID(struct, pt_regs)
+
+const struct bpf_func_proto bpf_task_pt_regs_proto = {
+ .func = bpf_task_pt_regs,
+ .gpl_only = true,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_task_struct_ids[0],
+ .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_btf_id = &bpf_task_pt_regs_ids[0],
};
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
@@ -948,7 +963,61 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.arg5_type = ARG_ANYTHING,
};
-const struct bpf_func_proto *
+BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
+ .func = bpf_get_func_ip_tracing,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
+{
+ struct kprobe *kp = kprobe_running();
+
+ return kp ? (uintptr_t)kp->addr : 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
+ .func = bpf_get_func_ip_kprobe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
+ .func = bpf_get_attach_cookie_trace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
+{
+ return ctx->event->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
+ .func = bpf_get_attach_cookie_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -978,6 +1047,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -990,33 +1061,36 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1056,8 +1130,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_for_each_map_elem_proto;
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
+ case BPF_FUNC_get_func_ip:
+ return &bpf_get_func_ip_proto_tracing;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1075,6 +1151,10 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
+ case BPF_FUNC_get_func_ip:
+ return &bpf_get_func_ip_proto_kprobe;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1185,6 +1265,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1292,6 +1374,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_prog_read_value_proto;
case BPF_FUNC_read_branch_records:
return &bpf_read_branch_records_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_pe;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1428,6 +1512,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *fn;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_skb_output:
@@ -1468,7 +1554,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
default:
- return raw_tp_prog_func_proto(func_id, prog);
+ fn = raw_tp_prog_func_proto(func_id, prog);
+ if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
+ fn = bpf_iter_get_func_proto(func_id, prog);
+ return fn;
}
}
@@ -1636,7 +1725,8 @@ static DEFINE_MUTEX(bpf_event_mutex);
#define BPF_TRACE_MAX_PROGS 64
int perf_event_attach_bpf_prog(struct perf_event *event,
- struct bpf_prog *prog)
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
@@ -1663,12 +1753,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
goto unlock;
}
- ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
if (ret < 0)
goto unlock;
/* set the new array to event->tp_event and set event->prog */
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
@@ -1689,7 +1780,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
- ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
if (ret == -ENOENT)
goto unlock;
if (ret < 0) {
@@ -1777,7 +1868,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
rcu_read_lock();
- (void) BPF_PROG_RUN(prog, args);
+ (void) bpf_prog_run(prog, args);
rcu_read_unlock();
}
@@ -1840,7 +1931,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
+ prog);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72ef4dccbcc4..feebf57c6458 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
}
/**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
* @rec: the record to update
* @enable: set to true if the record is tracing, false to force disable
*
@@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
}
/**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
* @rec: the record to test
* @enable: set to true to check if enabled, false if it is disabled
*
@@ -2574,7 +2574,7 @@ struct ftrace_rec_iter {
};
/**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
*
* Returns an iterator handle that is used to iterate over all
* the records that represent address locations where functions
@@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
}
/**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
* Returns the next iterator after the given iterator @iter.
@@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
}
/**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
* Returns the record that the current @iter is at.
@@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data)
}
/**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
* @command: The command to tell ftrace what to do
*
* If an arch needs to fall back to the stop machine method, the
@@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command)
}
/**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
* @command: The command that needs to be done
*
* Archs can override this function if it does not need to
@@ -3100,6 +3100,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
+ bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
u64 start, stop;
@@ -3138,8 +3139,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_nop_initialize(mod, p))
+ if (init_nop && !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
@@ -4212,8 +4212,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
if (!func) /* warn? */
continue;
- list_del(&ftrace_mod->list);
- list_add(&ftrace_mod->list, &process_mods);
+ list_move(&ftrace_mod->list, &process_mods);
/* Use the newly allocated func, as it may be "*" */
kfree(ftrace_mod->func);
@@ -5986,7 +5985,8 @@ ftrace_graph_release(struct inode *inode, struct file *file)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- synchronize_rcu_tasks_rude();
+ if (old_hash != EMPTY_HASH)
+ synchronize_rcu_tasks_rude();
free_ftrace_hash(old_hash);
}
@@ -6977,7 +6977,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
@@ -7052,7 +7052,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
{
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
@@ -7525,7 +7525,9 @@ void ftrace_kill(void)
}
/**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
@@ -7545,7 +7547,7 @@ int ftrace_is_dead(void)
*/
int register_ftrace_function(struct ftrace_ops *ops)
{
- int ret = -1;
+ int ret;
ftrace_ops_init(ops);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2c0ee6484990..c5a3fbf19617 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
}
- get_online_cpus();
+ cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- put_online_cpus();
+ cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- get_online_cpus();
+ cpus_read_lock();
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
@@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- put_online_cpus();
+ cpus_read_unlock();
}
out:
@@ -3391,7 +3391,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_PADDING:
if (event->time_delta == 1)
break;
- /* fall through */
+ fallthrough;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
break;
@@ -3880,10 +3880,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
if (unlikely(!head))
return true;
- return reader->read == rb_page_commit(reader) &&
- (commit == reader ||
- (commit == head &&
- head->read == rb_page_commit(commit)));
+ /* Reader should exhaust content in reader page */
+ if (reader->read != rb_page_commit(reader))
+ return false;
+
+ /*
+ * If writers are committing on the reader page, knowing all
+ * committed content has been read, the ring buffer is empty.
+ */
+ if (commit == reader)
+ return true;
+
+ /*
+ * If writers are committing on a page other than reader page
+ * and head page, there should always be content to read.
+ */
+ if (commit != head)
+ return false;
+
+ /*
+ * Writers are committing on the head page, we just need
+ * to care about there're committed data, and the reader will
+ * swap reader page with head page when it is to read data.
+ */
+ return rb_page_commit(commit) == 0;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d23a09d3eb37..bc677cd64224 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -86,6 +87,7 @@ void __init disable_tracing_selftest(const char *reason)
/* Pipe tracepoints to printk */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
+static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
@@ -196,12 +198,12 @@ __setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str) {
+ if (*str++ != '=' || !*str || !strcmp("1", str)) {
ftrace_dump_on_oops = DUMP_ALL;
return 1;
}
- if (!strcmp("orig_cpu", str)) {
+ if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
ftrace_dump_on_oops = DUMP_ORIG;
return 1;
}
@@ -256,6 +258,13 @@ static int __init set_tracepoint_printk(char *str)
}
__setup("tp_printk", set_tracepoint_printk);
+static int __init set_tracepoint_printk_stop(char *str)
+{
+ tracepoint_printk_stop_on_boot = true;
+ return 1;
+}
+__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -1682,8 +1691,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
static const struct file_operations tracing_max_lat_fops;
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#ifdef LATENCY_FS_NOTIFY
static struct workqueue_struct *fsnotify_wq;
@@ -1736,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- * defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", 0644, d_tracer, \
&tr->max_latency, &tracing_max_lat_fops)
+#else
+#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -2184,8 +2191,15 @@ void tracing_reset_all_online_cpus(void)
}
}
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
static int *tgid_map;
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -2458,24 +2472,41 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
int trace_find_tgid(int pid)
{
- if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
- return 0;
+ int *ptr = trace_find_tgid_ptr(pid);
- return tgid_map[pid];
+ return ptr ? *ptr : 0;
}
static int trace_save_tgid(struct task_struct *tsk)
{
+ int *ptr;
+
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
return 0;
- tgid_map[tsk->pid] = tsk->tgid;
+ *ptr = tsk->tgid;
return 1;
}
@@ -2571,6 +2602,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
+static unsigned short migration_disable_value(void)
+{
+#if defined(CONFIG_SMP)
+ return current->migration_disabled;
+#else
+ return 0;
+#endif
+}
+
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
unsigned int trace_flags = irqs_status;
@@ -2589,7 +2629,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
- return (trace_flags << 16) | (pc & 0xff);
+ return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
struct ring_buffer_event *
@@ -2729,9 +2770,45 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
if (!tr->no_filter_buffering_ref &&
(trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
- /* Try to use the per cpu buffer first */
+ /*
+ * Filtering is on, so try to use the per cpu buffer first.
+ * This buffer will simulate a ring_buffer_event,
+ * where the type_len is zero and the array[0] will
+ * hold the full length.
+ * (see include/linux/ring-buffer.h for details on
+ * how the ring_buffer_event is structured).
+ *
+ * Using a temp buffer during filtering and copying it
+ * on a matched filter is quicker than writing directly
+ * into the ring buffer and then discarding it when
+ * it doesn't match. That is because the discard
+ * requires several atomic operations to get right.
+ * Copying on match and doing nothing on a failed match
+ * is still quicker than no copy on match, but having
+ * to discard out of the ring buffer on a failed match.
+ */
+ int max_len = PAGE_SIZE - struct_size(entry, array, 1);
+
val = this_cpu_inc_return(trace_buffered_event_cnt);
- if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) {
+
+ /*
+ * Preemption is disabled, but interrupts and NMIs
+ * can still come in now. If that happens after
+ * the above increment, then it will have to go
+ * back to the old method of allocating the event
+ * on the ring buffer, and if the filter fails, it
+ * will have to call ring_buffer_discard_commit()
+ * to remove it.
+ *
+ * Need to also check the unlikely case that the
+ * length is bigger than the temp buffer size.
+ * If that happens, then the reserve is pretty much
+ * guaranteed to fail, as the ring buffer currently
+ * only allows events less than a page. But that may
+ * change in the future, so let the ring buffer reserve
+ * handle the failure in that case.
+ */
+ if (val == 1 && likely(len <= max_len)) {
trace_event_setup(entry, type, trace_ctx);
entry->array[0] = len;
return entry;
@@ -2829,14 +2906,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
+ enum event_trigger_type tt = ETT_NONE;
+ struct trace_event_file *file = fbuffer->trace_file;
+
+ if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
+ fbuffer->entry, &tt))
+ goto discard;
+
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
if (static_branch_unlikely(&trace_event_exports_enabled))
ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
- event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->trace_ctx, fbuffer->regs);
+
+ trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
+ fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);
+
+discard:
+ if (tt)
+ event_triggers_post_call(file, tt);
+
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -3617,11 +3706,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str)
return false;
event = container_of(trace_event, struct trace_event_call, event);
- if (!event->mod)
+ if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module)
return false;
/* Would rather have rodata, but this will suffice */
- if (within_module_core(addr, event->mod))
+ if (within_module_core(addr, event->module))
return true;
return false;
@@ -4109,9 +4198,10 @@ static void print_lat_help_header(struct seq_file *m)
"# | / _----=> need-resched \n"
"# || / _---=> hardirq/softirq \n"
"# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ "# |||| / _-=> migrate-disable \n"
+ "# ||||| / delay \n"
+ "# cmd pid |||||| time | caller \n"
+ "# \\ / |||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4149,9 +4239,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
}
void
@@ -5171,6 +5262,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ int *map;
+
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -5193,10 +5286,19 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map)
- tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
- sizeof(*tgid_map),
- GFP_KERNEL);
+ if (!tgid_map) {
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ }
if (!tgid_map) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
@@ -5452,6 +5554,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
+ "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -5461,7 +5564,7 @@ static const char readme_msg[] =
" place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
- "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#else
@@ -5476,6 +5579,8 @@ static const char readme_msg[] =
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
"\t [unsigned] char/int/long\n"
#endif
+ "\t efield: For event probes ('e' types), the field is on of the fields\n"
+ "\t of the <attached-group>/<attached-event>.\n"
#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
@@ -5530,6 +5635,10 @@ static const char readme_msg[] =
"\t [:name=histname1]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
+ "\t Note, special fields can be used as well:\n"
+ "\t common_timestamp - to record current timestamp\n"
+ "\t common_cpu - to record the CPU the event happened on\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
@@ -5559,6 +5668,7 @@ static const char readme_msg[] =
"\t .execname display a common_pid as a program name\n"
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
+ "\t .buckets=size display values in groups of size rather than raw number\n"
"\t .usecs display a common_timestamp in microseconds\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
@@ -5608,37 +5718,16 @@ static const struct file_operations tracing_readme_fops = {
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
- int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
+ int pid = ++(*pos);
- (*pos)++;
-
- for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
- if (trace_find_tgid(*ptr))
- return ptr;
- }
-
- return NULL;
+ return trace_find_tgid_ptr(pid);
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- void *v;
- loff_t l = 0;
-
- if (!tgid_map)
- return NULL;
+ int pid = *pos;
- v = &tgid_map[0];
- while (l <= *pos) {
- v = saved_tgids_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
+ return trace_find_tgid_ptr(pid);
}
static void saved_tgids_stop(struct seq_file *m, void *v)
@@ -5647,9 +5736,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v)
static int saved_tgids_show(struct seq_file *m, void *v)
{
- int pid = (int *)v - tgid_map;
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
- seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ if (tgid == 0)
+ return SEQ_SKIP;
+
+ seq_printf(m, "%d %d\n", pid, tgid);
return 0;
}
@@ -6134,7 +6228,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret = size;
+ int ret;
mutex_lock(&trace_types_lock);
@@ -7528,6 +7622,91 @@ static const struct file_operations snapshot_raw_fops = {
#endif /* CONFIG_TRACER_SNAPSHOT */
+/*
+ * trace_min_max_write - Write a u64 value to a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the write interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param structure that
+ * defines where to write the value, the min and the max acceptable values,
+ * and a lock to protect the write.
+ */
+static ssize_t
+trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ u64 val;
+ int err;
+
+ if (!param)
+ return -EFAULT;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ if (param->lock)
+ mutex_lock(param->lock);
+
+ if (param->min && val < *param->min)
+ err = -EINVAL;
+
+ if (param->max && val > *param->max)
+ err = -EINVAL;
+
+ if (!err)
+ *param->val = val;
+
+ if (param->lock)
+ mutex_unlock(param->lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+/*
+ * trace_min_max_read - Read a u64 value from a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the read interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param struct with valid
+ * data.
+ */
+static ssize_t
+trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ char buf[U64_STR_SIZE];
+ int len;
+ u64 val;
+
+ if (!param)
+ return -EFAULT;
+
+ val = *param->val;
+
+ if (cnt > sizeof(buf))
+ cnt = sizeof(buf);
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+const struct file_operations trace_min_max_fops = {
+ .open = tracing_open_generic,
+ .read = trace_min_max_read,
+ .write = trace_min_max_write,
+};
+
#define TRACING_LOG_ERRS_MAX 8
#define TRACING_LOG_LOC_MAX 128
@@ -8983,8 +9162,10 @@ static int trace_array_create_dir(struct trace_array *tr)
return -EINVAL;
ret = event_trace_add_tracer(tr->dir, tr);
- if (ret)
+ if (ret) {
tracefs_remove(tr->dir);
+ return ret;
+ }
init_tracer_tracefs(tr, tr->dir);
__update_tracer_options(tr);
@@ -9291,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
@@ -9531,6 +9710,8 @@ static __init int tracer_init_tracefs(void)
return 0;
}
+fs_initcall(tracer_init_tracefs);
+
static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
{
@@ -9647,7 +9828,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -9729,7 +9909,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
@@ -9951,7 +10130,7 @@ void __init trace_init(void)
trace_event_init();
}
-__init static int clear_boot_tracer(void)
+__init static void clear_boot_tracer(void)
{
/*
* The default tracer at boot buffer is an init section.
@@ -9961,26 +10140,21 @@ __init static int clear_boot_tracer(void)
* about to be freed.
*/
if (!default_bootup_tracer)
- return 0;
+ return;
printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
default_bootup_tracer);
default_bootup_tracer = NULL;
-
- return 0;
}
-fs_initcall(tracer_init_tracefs);
-late_initcall_sync(clear_boot_tracer);
-
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__init static int tracing_set_default_clock(void)
+__init static void tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Can not set tracing clock due to lockdown\n");
- return -EPERM;
+ return;
}
printk(KERN_WARNING
@@ -9990,8 +10164,21 @@ __init static int tracing_set_default_clock(void)
"on the kernel command line\n");
tracing_set_clock(&global_trace, "global");
}
+}
+#else
+static inline void tracing_set_default_clock(void) { }
+#endif
+
+__init static int late_trace_init(void)
+{
+ if (tracepoint_printk && tracepoint_printk_stop_on_boot) {
+ static_key_disable(&tracepoint_printk_key.key);
+ tracepoint_printk = 0;
+ }
+ tracing_set_default_clock();
+ clear_boot_tracer();
return 0;
}
-late_initcall_sync(tracing_set_default_clock);
-#endif
+
+late_initcall_sync(late_trace_init);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cd80d046c7a5..b7c0f8e160fb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/ctype.h>
+#include <linux/once_lite.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -44,6 +45,8 @@ enum trace_type {
TRACE_BLK,
TRACE_BPUTS,
TRACE_HWLAT,
+ TRACE_OSNOISE,
+ TRACE_TIMERLAT,
TRACE_RAW_DATA,
TRACE_FUNC_REPEATS,
@@ -99,16 +102,8 @@ enum trace_type {
#include "trace_entries.h"
/* Use this for memory failure errors */
-#define MEM_FAIL(condition, fmt, ...) ({ \
- static bool __section(".data.once") __warned; \
- int __ret_warn_once = !!(condition); \
- \
- if (unlikely(__ret_warn_once && !__warned)) { \
- __warned = true; \
- pr_err("ERROR: " fmt, ##__VA_ARGS__); \
- } \
- unlikely(__ret_warn_once); \
-})
+#define MEM_FAIL(condition, fmt, ...) \
+ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
/*
* syscalls are special, and need special handling, this is why
@@ -131,6 +126,11 @@ struct kprobe_trace_entry_head {
unsigned long ip;
};
+struct eprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned int type;
+};
+
struct kretprobe_trace_entry_head {
struct trace_entry ent;
unsigned long func;
@@ -297,7 +297,8 @@ struct trace_array {
struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -445,6 +446,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
+ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\
+ IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\
IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@@ -675,15 +678,15 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
+#define LATENCY_FS_NOTIFY
+#endif
+#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
-
#else
-
static inline void latency_fsnotify(struct trace_array *tr) { }
-
#endif
#ifdef CONFIG_STACKTRACE
@@ -1391,38 +1394,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
event_triggers_post_call(file, tt);
}
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer associated with the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @trace_ctx: The tracing context flags.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event,
- void *entry, unsigned int trace_ctx,
- struct pt_regs *regs)
-{
- enum event_trigger_type tt = ETT_NONE;
-
- if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- trace_ctx, regs);
-
- if (tt)
- event_triggers_post_call(file, tt);
-}
-
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
@@ -1542,9 +1513,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);
+enum {
+ EVENT_TRIGGER_FL_PROBE = BIT(0),
+};
+
struct event_trigger_data {
unsigned long count;
int ref;
+ int flags;
struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
@@ -1952,4 +1928,30 @@ static inline bool is_good_name(const char *name)
return true;
}
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
+/*
+ * This is a generic way to read and write a u64 value from a file in tracefs.
+ *
+ * The value is stored on the variable pointed by *val. The value needs
+ * to be at least *min and at most *max. The write is protected by an
+ * existing *lock.
+ */
+struct trace_min_max_param {
+ struct mutex *lock;
+ u64 *val;
+ u64 *min;
+ u64 *max;
+};
+
+#define U64_STR_SIZE 24 /* 20 digits max */
+
+extern const struct file_operations trace_min_max_fops;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a82f03f385f8..8d252f63cd78 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -171,6 +171,289 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event)
}
#endif
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init __printf(3, 4)
+append_printf(char **bufp, char *end, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ if (*bufp == end)
+ return -ENOSPC;
+
+ va_start(args, fmt);
+ ret = vsnprintf(*bufp, end - *bufp, fmt, args);
+ if (ret < end - *bufp) {
+ *bufp += ret;
+ } else {
+ *bufp = end;
+ ret = -ERANGE;
+ }
+ va_end(args);
+
+ return ret;
+}
+
+static int __init
+append_str_nospace(char **bufp, char *end, const char *str)
+{
+ char *p = *bufp;
+ int len;
+
+ while (p < end - 1 && *str != '\0') {
+ if (!isspace(*str))
+ *(p++) = *str;
+ str++;
+ }
+ *p = '\0';
+ if (p == end - 1) {
+ *bufp = end;
+ return -ENOSPC;
+ }
+ len = p - *bufp;
+ *bufp = p;
+ return (int)len;
+}
+
+static int __init
+trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
+ char *end, const char *key)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char sep;
+
+ p = xbc_node_find_value(hnode, key, &anode);
+ if (p) {
+ if (!anode) {
+ pr_err("hist.%s requires value(s).\n", key);
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ":%s", key);
+ sep = '=';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '=')
+ sep = ',';
+ }
+ } else
+ return -ENOENT;
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
+ char *end, const char *handler,
+ const char *param)
+{
+ struct xbc_node *knode, *anode;
+ const char *p;
+ char sep;
+
+ /* Compose 'handler' parameter */
+ p = xbc_node_find_value(hnode, param, NULL);
+ if (!p) {
+ pr_err("hist.%s requires '%s' option.\n",
+ xbc_node_get_data(hnode), param);
+ return -EINVAL;
+ }
+ append_printf(bufp, end, ":%s(%s)", handler, p);
+
+ /* Compose 'action' parameter */
+ knode = xbc_node_find_subkey(hnode, "trace");
+ if (!knode)
+ knode = xbc_node_find_subkey(hnode, "save");
+
+ if (knode) {
+ anode = xbc_node_get_child(knode);
+ if (!anode || !xbc_node_is_value(anode)) {
+ pr_err("hist.%s.%s requires value(s).\n",
+ xbc_node_get_data(hnode),
+ xbc_node_get_data(knode));
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ".%s", xbc_node_get_data(knode));
+ sep = '(';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '(')
+ sep = ',';
+ }
+ append_printf(bufp, end, ")");
+ } else if (xbc_node_find_subkey(hnode, "snapshot")) {
+ append_printf(bufp, end, ".snapshot()");
+ } else {
+ pr_err("hist.%s requires an action.\n",
+ xbc_node_get_data(hnode));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
+ char *end, const char *param)
+{
+ struct xbc_node *node;
+ const char *p, *handler;
+ int ret;
+
+ handler = xbc_node_get_data(hnode);
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param);
+ if (ret < 0)
+ break;
+ }
+
+ if (xbc_node_find_subkey(hnode, param))
+ ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
+
+ return ret;
+}
+
+/*
+ * Histogram boottime tracing syntax.
+ *
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] {
+ * keys = <KEY>[,...]
+ * values = <VAL>[,...]
+ * sort = <SORT-KEY>[,...]
+ * size = <ENTRIES>
+ * name = <HISTNAME>
+ * var { <VAR> = <EXPR> ... }
+ * pause|continue|clear
+ * onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
+ * onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
+ * filter = <FILTER>
+ * }
+ *
+ * Where <ACTION> are;
+ *
+ * trace = <EVENT>, <ARG1>[, ...]
+ * save = <ARG1>[, ...]
+ * snapshot
+ */
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node, *knode;
+ char *end = buf + size;
+ const char *p;
+ int ret = 0;
+
+ append_printf(&buf, end, "hist");
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "keys");
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ pr_err("hist requires keys.\n");
+ return -EINVAL;
+ }
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "values");
+ if (ret == -EINVAL)
+ return ret;
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "sort");
+ if (ret == -EINVAL)
+ return ret;
+
+ p = xbc_node_find_value(hnode, "size", NULL);
+ if (p)
+ append_printf(&buf, end, ":size=%s", p);
+
+ p = xbc_node_find_value(hnode, "name", NULL);
+ if (p)
+ append_printf(&buf, end, ":name=%s", p);
+
+ node = xbc_node_find_subkey(hnode, "var");
+ if (node) {
+ xbc_node_for_each_key_value(node, knode, p) {
+ /* Expression must not include spaces. */
+ append_printf(&buf, end, ":%s=",
+ xbc_node_get_data(knode));
+ append_str_nospace(&buf, end, p);
+ }
+ }
+
+ /* Histogram control attributes (mutual exclusive) */
+ if (xbc_node_find_value(hnode, "pause", NULL))
+ append_printf(&buf, end, ":pause");
+ else if (xbc_node_find_value(hnode, "continue", NULL))
+ append_printf(&buf, end, ":continue");
+ else if (xbc_node_find_value(hnode, "clear", NULL))
+ append_printf(&buf, end, ":clear");
+
+ /* Histogram handler and actions */
+ node = xbc_node_find_subkey(hnode, "onmax");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onchange");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onmatch");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
+ return -EINVAL;
+
+ p = xbc_node_find_value(hnode, "filter", NULL);
+ if (p)
+ append_printf(&buf, end, " if %s", p);
+
+ if (buf == end) {
+ pr_err("hist exceeds the max command length.\n");
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node;
+ const char *p;
+ char *tmp;
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+
+ if (xbc_node_find_subkey(hnode, "keys")) {
+ if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+}
+#else
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ /* do nothing */
+}
+#endif
+
static void __init
trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
struct xbc_node *enode)
@@ -205,12 +488,18 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
pr_err("Failed to apply filter: %s\n", buf);
}
- xbc_node_for_each_array_value(enode, "actions", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
- pr_err("action string is too long: %s\n", p);
- else if (trigger_process_regex(file, buf) < 0)
- pr_err("Failed to apply an action: %s\n", buf);
- }
+ if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", p);
+ }
+ anode = xbc_node_find_subkey(enode, "hist");
+ if (anode)
+ trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
+ } else if (xbc_node_find_value(enode, "actions", NULL))
+ pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
if (xbc_node_find_value(enode, "enable", NULL)) {
if (trace_event_enable_disable(file, 1, 0) < 0)
@@ -225,14 +514,37 @@ static void __init
trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
{
struct xbc_node *gnode, *enode;
+ bool enable, enable_all = false;
+ const char *data;
- node = xbc_node_find_child(node, "event");
+ node = xbc_node_find_subkey(node, "event");
if (!node)
return;
/* per-event key starts with "event.GROUP.EVENT" */
- xbc_node_for_each_child(node, gnode)
- xbc_node_for_each_child(gnode, enode)
+ xbc_node_for_each_subkey(node, gnode) {
+ data = xbc_node_get_data(gnode);
+ if (!strcmp(data, "enable")) {
+ enable_all = true;
+ continue;
+ }
+ enable = false;
+ xbc_node_for_each_subkey(gnode, enode) {
+ data = xbc_node_get_data(enode);
+ if (!strcmp(data, "enable")) {
+ enable = true;
+ continue;
+ }
trace_boot_init_one_event(tr, gnode, enode);
+ }
+ /* Event enablement must be done after event settings */
+ if (enable) {
+ data = xbc_node_get_data(gnode);
+ trace_array_set_clr_event(tr, data, NULL, true);
+ }
+ }
+ /* Ditto */
+ if (enable_all)
+ trace_array_set_clr_event(tr, NULL, NULL, true);
}
#else
#define trace_boot_enable_events(tr, node) do {} while (0)
@@ -308,11 +620,11 @@ trace_boot_init_instances(struct xbc_node *node)
struct trace_array *tr;
const char *p;
- node = xbc_node_find_child(node, "instance");
+ node = xbc_node_find_subkey(node, "instance");
if (!node)
return;
- xbc_node_for_each_child(node, inode) {
+ xbc_node_for_each_subkey(node, inode) {
p = xbc_node_get_data(inode);
if (!p || *p == '\0')
continue;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e57cc0870892..1110112e55bd 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -13,11 +13,49 @@
#include <linux/tracefs.h>
#include "trace.h"
+#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
static DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
+bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
+{
+ struct trace_event_call *call;
+ bool ret = false;
+
+ if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call == dyn_call) {
+ atomic_inc(&dyn_call->refcnt);
+ ret = true;
+ }
+ }
+ up_read(&trace_event_sem);
+ return ret;
+}
+
+void trace_event_dyn_put_ref(struct trace_event_call *call)
+{
+ if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return;
+
+ if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) {
+ atomic_set(&call->refcnt, 0);
+ return;
+ }
+
+ atomic_dec(&call->refcnt);
+}
+
+bool trace_event_dyn_busy(struct trace_event_call *call)
+{
+ return atomic_read(&call->refcnt) != 0;
+}
+
int dyn_event_register(struct dyn_event_operations *ops)
{
if (!ops || !ops->create || !ops->show || !ops->is_busy ||
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 7754936b57ee..936477a111d3 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
return 0;
}
-static inline int dyn_event_add(struct dyn_event *ev)
+static inline int dyn_event_add(struct dyn_event *ev,
+ struct trace_event_call *call)
{
lockdep_assert_held(&event_mutex);
if (!ev || !ev->ops)
return -EINVAL;
+ call->flags |= TRACE_EVENT_FL_DYNAMIC;
list_add_tail(&ev->list, &dyn_event_list);
return 0;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 251c819cf0c5..cd41e863b51c 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -360,3 +360,44 @@ FTRACE_ENTRY(func_repeats, func_repeats_entry,
__entry->count,
FUNC_REPEATS_GET_DELTA_TS(__entry))
);
+
+FTRACE_ENTRY(osnoise, osnoise_entry,
+
+ TRACE_OSNOISE,
+
+ F_STRUCT(
+ __field( u64, noise )
+ __field( u64, runtime )
+ __field( u64, max_sample )
+ __field( unsigned int, hw_count )
+ __field( unsigned int, nmi_count )
+ __field( unsigned int, irq_count )
+ __field( unsigned int, softirq_count )
+ __field( unsigned int, thread_count )
+ ),
+
+ F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n",
+ __entry->noise,
+ __entry->max_sample,
+ __entry->hw_count,
+ __entry->nmi_count,
+ __entry->irq_count,
+ __entry->softirq_count,
+ __entry->thread_count)
+);
+
+FTRACE_ENTRY(timerlat, timerlat_entry,
+
+ TRACE_TIMERLAT,
+
+ F_STRUCT(
+ __field( unsigned int, seqnum )
+ __field( int, context )
+ __field( u64, timer_latency )
+ ),
+
+ F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n",
+ __entry->seqnum,
+ __entry->context,
+ __entry->timer_latency)
+);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
new file mode 100644
index 000000000000..928867f527e7
--- /dev/null
+++ b/kernel/trace/trace_eprobe.c
@@ -0,0 +1,959 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event probes
+ *
+ * Part of this code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <mhiramat@kernel.org>
+ *
+ * Copyright (C) 2021, VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
+
+#define EPROBE_EVENT_SYSTEM "eprobes"
+
+struct trace_eprobe {
+ /* tracepoint system */
+ const char *event_system;
+
+ /* tracepoint event */
+ const char *event_name;
+
+ struct trace_event_call *event;
+
+ struct dyn_event devent;
+ struct trace_probe tp;
+};
+
+struct eprobe_data {
+ struct trace_event_file *file;
+ struct trace_eprobe *ep;
+};
+
+static int __trace_eprobe_create(int argc, const char *argv[]);
+
+static void trace_event_probe_cleanup(struct trace_eprobe *ep)
+{
+ if (!ep)
+ return;
+ trace_probe_cleanup(&ep->tp);
+ kfree(ep->event_name);
+ kfree(ep->event_system);
+ if (ep->event)
+ trace_event_put_ref(ep->event);
+ kfree(ep);
+}
+
+static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_eprobe, devent);
+}
+
+static int eprobe_dyn_event_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_eprobe_create);
+}
+
+static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int i;
+
+ seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp),
+ trace_probe_name(&ep->tp));
+ seq_printf(m, " %s.%s", ep->event_system, ep->event_name);
+
+ for (i = 0; i < ep->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int unregister_trace_eprobe(struct trace_eprobe *ep)
+{
+ /* If other probes are on the event, just unregister eprobe */
+ if (trace_probe_has_sibling(&ep->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&ep->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (trace_probe_unregister_event_call(&ep->tp))
+ return -EBUSY;
+
+unreg:
+ dyn_event_remove(&ep->devent);
+ trace_probe_unlink(&ep->tp);
+
+ return 0;
+}
+
+static int eprobe_dyn_event_release(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int ret = unregister_trace_eprobe(ep);
+
+ if (!ret)
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+static bool eprobe_dyn_event_is_busy(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+
+ return trace_probe_is_enabled(&ep->tp);
+}
+
+static bool eprobe_dyn_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ const char *slash;
+
+ /*
+ * We match the following:
+ * event only - match all eprobes with event name
+ * system and event only - match all system/event probes
+ *
+ * The below has the above satisfied with more arguments:
+ *
+ * attached system/event - If the arg has the system and event
+ * the probe is attached to, match
+ * probes with the attachment.
+ *
+ * If any more args are given, then it requires a full match.
+ */
+
+ /*
+ * If system exists, but this probe is not part of that system
+ * do not match.
+ */
+ if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+ return false;
+
+ /* Must match the event name */
+ if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+ return false;
+
+ /* No arguments match all */
+ if (argc < 1)
+ return true;
+
+ /* First argument is the system/event the probe is attached to */
+
+ slash = strchr(argv[0], '/');
+ if (!slash)
+ slash = strchr(argv[0], '.');
+ if (!slash)
+ return false;
+
+ if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+ return false;
+ if (strcmp(ep->event_name, slash + 1))
+ return false;
+
+ argc--;
+ argv++;
+
+ /* If there are no other args, then match */
+ if (argc < 1)
+ return true;
+
+ return trace_probe_match_command_args(&ep->tp, argc, argv);
+}
+
+static struct dyn_event_operations eprobe_dyn_event_ops = {
+ .create = eprobe_dyn_event_create,
+ .show = eprobe_dyn_event_show,
+ .is_busy = eprobe_dyn_event_is_busy,
+ .free = eprobe_dyn_event_release,
+ .match = eprobe_dyn_event_match,
+};
+
+static struct trace_eprobe *alloc_event_probe(const char *group,
+ const char *this_event,
+ struct trace_event_call *event,
+ int nargs)
+{
+ struct trace_eprobe *ep;
+ const char *event_name;
+ const char *sys_name;
+ int ret = -ENOMEM;
+
+ if (!event)
+ return ERR_PTR(-ENODEV);
+
+ sys_name = event->class->system;
+ event_name = trace_event_name(event);
+
+ ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
+ if (!ep) {
+ trace_event_put_ref(event);
+ goto error;
+ }
+ ep->event = event;
+ ep->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ep->event_name)
+ goto error;
+ ep->event_system = kstrdup(sys_name, GFP_KERNEL);
+ if (!ep->event_system)
+ goto error;
+
+ ret = trace_probe_init(&ep->tp, this_event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
+ return ep;
+error:
+ trace_event_probe_cleanup(ep);
+ return ERR_PTR(ret);
+}
+
+static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
+{
+ struct probe_arg *parg = &ep->tp.args[i];
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ head = trace_get_fields(ep->event);
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(parg->code->data, field->name)) {
+ kfree(parg->code->data);
+ parg->code->data = field;
+ return 0;
+ }
+ }
+ kfree(parg->code->data);
+ parg->code->data = NULL;
+ return -ENOENT;
+}
+
+static int eprobe_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct eprobe_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_fields eprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = eprobe_event_define_fields },
+ {}
+};
+
+/* Event entry printers */
+static enum print_line_t
+print_eprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct eprobe_trace_entry_head *field;
+ struct trace_event_call *pevent;
+ struct trace_event *probed_event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct eprobe_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ probed_event = ftrace_find_event(field->type);
+ if (probed_event) {
+ pevent = container_of(probed_event, struct trace_event_call, event);
+ trace_seq_printf(s, "%s.%s", pevent->class->system,
+ trace_event_name(pevent));
+ } else {
+ trace_seq_printf(s, "%u", field->type);
+ }
+
+ trace_seq_putc(s, ')');
+
+ if (print_probe_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static unsigned long get_event_field(struct fetch_insn *code, void *rec)
+{
+ struct ftrace_event_field *field = code->data;
+ unsigned long val;
+ void *addr;
+
+ addr = rec + field->offset;
+
+ switch (field->size) {
+ case 1:
+ if (field->is_signed)
+ val = *(char *)addr;
+ else
+ val = *(unsigned char *)addr;
+ break;
+ case 2:
+ if (field->is_signed)
+ val = *(short *)addr;
+ else
+ val = *(unsigned short *)addr;
+ break;
+ case 4:
+ if (field->is_signed)
+ val = *(int *)addr;
+ else
+ val = *(unsigned int *)addr;
+ break;
+ default:
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ return val;
+}
+
+static int get_eprobe_size(struct trace_probe *tp, void *rec)
+{
+ struct probe_arg *arg;
+ int i, len, ret = 0;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ if (unlikely(arg->dynamic)) {
+ unsigned long val;
+
+ val = get_event_field(arg->code, rec);
+ len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
+ if (len > 0)
+ ret += len;
+ }
+ }
+
+ return ret;
+}
+
+/* Kprobe specific fetch functions */
+
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ unsigned long val;
+
+ val = get_event_field(code, rec);
+ return process_fetch_insn_bottom(code + 1, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ return (ret < 0) ? ret : len;
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
+/* eprobe handler */
+static inline void
+__eprobe_trace_func(struct eprobe_data *edata, void *rec)
+{
+ struct eprobe_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != edata->file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(edata->file))
+ return;
+
+ fbuffer.trace_ctx = tracing_gen_ctx();
+ fbuffer.trace_file = edata->file;
+
+ dsize = get_eprobe_size(&edata->ep->tp, rec);
+ fbuffer.regs = NULL;
+
+ fbuffer.event =
+ trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file,
+ call->event.type,
+ sizeof(*entry) + edata->ep->tp.size + dsize,
+ fbuffer.trace_ctx);
+ if (!fbuffer.event)
+ return;
+
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ if (edata->ep->event)
+ entry->type = edata->ep->event->event.type;
+ else
+ entry->type = 0;
+ store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+/*
+ * The event probe implementation uses event triggers to get access to
+ * the event it is attached to, but is not an actual trigger. The below
+ * functions are just stubs to fulfill what is needed to use the trigger
+ * infrastructure.
+ */
+static int eprobe_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return 0;
+}
+
+static void eprobe_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+
+}
+
+static int eprobe_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ /* Do not print eprobe event triggers */
+ return 0;
+}
+
+static void eprobe_trigger_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *rbe)
+{
+ struct eprobe_data *edata = data->private_data;
+
+ __eprobe_trace_func(edata, rec);
+}
+
+static struct event_trigger_ops eprobe_trigger_ops = {
+ .func = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
+};
+
+static int eprobe_trigger_cmd_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ return -1;
+}
+
+static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ return -1;
+}
+
+static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+
+}
+
+static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
+{
+ return &eprobe_trigger_ops;
+}
+
+static struct event_command event_trigger_cmd = {
+ .name = "eprobe",
+ .trigger_type = ETT_EVENT_EPROBE,
+ .flags = EVENT_CMD_FL_NEEDS_REC,
+ .func = eprobe_trigger_cmd_func,
+ .reg = eprobe_trigger_reg_func,
+ .unreg = eprobe_trigger_unreg_func,
+ .unreg_all = NULL,
+ .get_trigger_ops = eprobe_trigger_get_ops,
+ .set_filter = NULL,
+};
+
+static struct event_trigger_data *
+new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
+{
+ struct event_trigger_data *trigger;
+ struct eprobe_data *edata;
+
+ edata = kzalloc(sizeof(*edata), GFP_KERNEL);
+ trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
+ if (!trigger || !edata) {
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trigger->flags = EVENT_TRIGGER_FL_PROBE;
+ trigger->count = -1;
+ trigger->ops = &eprobe_trigger_ops;
+
+ /*
+ * EVENT PROBE triggers are not registered as commands with
+ * register_event_command(), as they are not controlled by the user
+ * from the trigger file
+ */
+ trigger->cmd_ops = &event_trigger_cmd;
+
+ INIT_LIST_HEAD(&trigger->list);
+ RCU_INIT_POINTER(trigger->filter, NULL);
+
+ edata->file = file;
+ edata->ep = ep;
+ trigger->private_data = edata;
+
+ return trigger;
+}
+
+static int enable_eprobe(struct trace_eprobe *ep,
+ struct trace_event_file *eprobe_file)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct trace_array *tr = eprobe_file->tr;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+ trigger = new_eprobe_trigger(ep, eprobe_file);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+
+ list_add_tail_rcu(&trigger->list, &file->triggers);
+
+ trace_event_trigger_enable_disable(file, 1);
+ update_cond_flag(file);
+
+ return 0;
+}
+
+static struct trace_event_functions eprobe_funcs = {
+ .trace = print_eprobe_event
+};
+
+static int disable_eprobe(struct trace_eprobe *ep,
+ struct trace_array *tr)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct eprobe_data *edata;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+
+ list_for_each_entry(trigger, &file->triggers, list) {
+ if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+ continue;
+ edata = trigger->private_data;
+ if (edata->ep == ep)
+ break;
+ }
+ if (list_entry_is_head(trigger, &file->triggers, list))
+ return -ENODEV;
+
+ list_del_rcu(&trigger->list);
+
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+
+ /* Make sure nothing is using the edata or trigger */
+ tracepoint_synchronize_unregister();
+
+ kfree(edata);
+ kfree(trigger);
+
+ return 0;
+}
+
+static int enable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (enabled)
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ ret = enable_eprobe(ep, file);
+ if (ret)
+ break;
+ enabled = true;
+ }
+
+ if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled)
+ disable_eprobe(ep, file->tr);
+ if (file)
+ trace_probe_remove_file(tp, file);
+ else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ }
+
+ return ret;
+}
+
+static int disable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ disable_eprobe(ep, file->tr);
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+static int eprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_eprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_eprobe(event, file);
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static inline void init_trace_eprobe_call(struct trace_eprobe *ep)
+{
+ struct trace_event_call *call = trace_probe_event_call(&ep->tp);
+
+ call->flags = TRACE_EVENT_FL_EPROBE;
+ call->event.funcs = &eprobe_funcs;
+ call->class->fields_array = eprobe_fields_array;
+ call->class->reg = eprobe_register;
+}
+
+static struct trace_event_call *
+find_and_get_event(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ /* Skip other probes and ftrace events */
+ if (tp_event->flags &
+ (TRACE_EVENT_FL_IGNORE_ENABLE |
+ TRACE_EVENT_FL_KPROBE |
+ TRACE_EVENT_FL_UPROBE |
+ TRACE_EVENT_FL_EPROBE))
+ continue;
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ if (!trace_event_try_get_ref(tp_event)) {
+ return NULL;
+ break;
+ }
+ return tp_event;
+ break;
+ }
+ return NULL;
+}
+
+static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
+{
+ unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
+ int ret;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags);
+ if (ret)
+ return ret;
+
+ if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG)
+ ret = trace_eprobe_tp_arg_update(ep, i);
+
+ return ret;
+}
+
+static int __trace_eprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS]
+ * Fetch args:
+ * <name>=$<field>[:TYPE]
+ */
+ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
+ const char *sys_event = NULL, *sys_name = NULL;
+ struct trace_event_call *event_call;
+ struct trace_eprobe *ep = NULL;
+ char buf1[MAX_EVENT_NAME_LEN];
+ char buf2[MAX_EVENT_NAME_LEN];
+ int ret = 0;
+ int i;
+
+ if (argc < 2 || argv[0][0] != 'e')
+ return -ECANCELED;
+
+ trace_probe_log_init("event_probe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event) {
+ event++;
+ ret = traceprobe_parse_event_name(&event, &group, buf1,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ } else {
+ strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
+ sanitize_event_name(buf1);
+ event = buf1;
+ }
+ if (!is_good_name(event) || !is_good_name(group))
+ goto parse_error;
+
+ sys_event = argv[1];
+ ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2,
+ sys_event - argv[1]);
+ if (ret || !sys_name)
+ goto parse_error;
+ if (!is_good_name(sys_event) || !is_good_name(sys_name))
+ goto parse_error;
+
+ mutex_lock(&event_mutex);
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ mutex_unlock(&event_mutex);
+
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ /* This must return -ENOMEM or missing event, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
+ ep = NULL;
+ goto error;
+ }
+
+ argc -= 2; argv += 2;
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ret = trace_eprobe_tp_update_arg(ep, argv, i);
+ if (ret)
+ goto error;
+ }
+ ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
+ if (ret < 0)
+ goto error;
+ init_trace_eprobe_call(ep);
+ mutex_lock(&event_mutex);
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ mutex_unlock(&event_mutex);
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ mutex_unlock(&event_mutex);
+ return ret;
+parse_error:
+ ret = -EINVAL;
+error:
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup eprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int trace_events_eprobe_init_early(void)
+{
+ int err = 0;
+
+ err = dyn_event_register(&eprobe_dyn_event_ops);
+ if (err)
+ pr_warn("Could not register eprobe_dyn_event_ops\n");
+
+ return err;
+}
+core_initcall(trace_events_eprobe_init_early);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 03be4435d103..6aed10e2f7ce 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
}
}
out:
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event)
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
- try_module_get(tp_event->mod)) {
+ trace_event_try_get_ref(tp_event)) {
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
break;
}
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 80e96989770e..830b3b9940f4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -181,6 +181,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned short, type);
__common_field(unsigned char, flags);
+ /* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
@@ -2525,7 +2526,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
return ret;
list_add(&call->list, &ftrace_events);
- call->mod = mod;
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
+ atomic_set(&call->refcnt, 0);
+ else
+ call->module = mod;
return 0;
}
@@ -2839,7 +2843,9 @@ static void trace_module_remove_events(struct module *mod)
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod)
+ if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
+ continue;
+ if (call->module == mod)
__trace_remove_event_call(call);
}
up_write(&trace_event_sem);
@@ -2982,7 +2988,7 @@ struct trace_event_file *trace_get_event_file(const char *instance,
}
/* Don't let event modules unload while in use */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
ret = -EBUSY;
@@ -3012,7 +3018,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file);
void trace_put_event_file(struct trace_event_file *file)
{
mutex_lock(&event_mutex);
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
mutex_unlock(&event_mutex);
trace_array_put(file->tr);
@@ -3147,7 +3153,7 @@ static int free_probe_data(void *data)
if (!edata->ref) {
/* Remove the SOFT_MODE flag */
__ftrace_event_enable_disable(edata->file, 0, 1);
- module_put(edata->file->event_call->mod);
+ trace_event_put_ref(edata->file->event_call);
kfree(edata);
}
return 0;
@@ -3280,7 +3286,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -3310,7 +3316,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
out_free:
kfree(data);
goto out;
@@ -3376,7 +3382,8 @@ void __trace_early_add_events(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
/* Early boot up should not have any modules loaded */
- if (WARN_ON_ONCE(call->mod))
+ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
+ WARN_ON_ONCE(call->module))
continue;
ret = __trace_early_add_new_event(call, tr);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c1abd63f1d6c..f01e442716e2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -65,7 +65,8 @@
C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
- C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -120,6 +121,7 @@ struct hist_field {
unsigned int size;
unsigned int offset;
unsigned int is_signed;
+ unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
@@ -218,6 +220,27 @@ static u64 hist_field_log2(struct hist_field *hist_field,
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_bucket(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+ unsigned long buckets = hist_field->buckets;
+
+ u64 val = operand->fn(operand, elt, buffer, rbe, event);
+
+ if (WARN_ON_ONCE(!buckets))
+ return val;
+
+ if (val >= LONG_MAX)
+ val = div64_ul(val, buckets);
+ else
+ val = (u64)((unsigned long)val / buckets);
+ return val * buckets;
+}
+
static u64 hist_field_plus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -317,6 +340,7 @@ enum hist_field_flags {
HIST_FIELD_FL_VAR_REF = 1 << 14,
HIST_FIELD_FL_CPU = 1 << 15,
HIST_FIELD_FL_ALIAS = 1 << 16,
+ HIST_FIELD_FL_BUCKET = 1 << 17,
};
struct var_defs {
@@ -484,7 +508,8 @@ struct track_data {
struct hist_elt_data {
char *comm;
u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
+ char **field_var_str;
+ int n_field_var_str;
};
struct snapshot_context {
@@ -1108,10 +1133,11 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
else if (field->flags & HIST_FIELD_FL_LOG2 ||
- field->flags & HIST_FIELD_FL_ALIAS)
+ field->flags & HIST_FIELD_FL_ALIAS ||
+ field->flags & HIST_FIELD_FL_BUCKET)
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
- field_name = "cpu";
+ field_name = "common_cpu";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -1376,9 +1402,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
unsigned int i;
- for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ for (i = 0; i < elt_data->n_field_var_str; i++)
kfree(elt_data->field_var_str[i]);
+ kfree(elt_data->field_var_str);
+
kfree(elt_data->comm);
kfree(elt_data);
}
@@ -1395,17 +1423,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_trigger_data *hist_data = elt->map->private_data;
unsigned int size = TASK_COMM_LEN;
struct hist_elt_data *elt_data;
- struct hist_field *key_field;
+ struct hist_field *hist_field;
unsigned int i, n_str;
elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
if (!elt_data)
return -ENOMEM;
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
elt_data->comm = kzalloc(size, GFP_KERNEL);
if (!elt_data->comm) {
kfree(elt_data);
@@ -1426,6 +1454,13 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
size = STR_VAR_LEN_MAX;
+ elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
+ if (!elt_data->field_var_str) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+ elt_data->n_field_var_str = n_str;
+
for (i = 0; i < n_str; i++) {
elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
if (!elt_data->field_var_str[i]) {
@@ -1469,6 +1504,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "syscall";
else if (hist_field->flags & HIST_FIELD_FL_LOG2)
flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_BUCKET)
+ flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
@@ -1555,6 +1592,13 @@ static int contains_operator(char *str)
switch (*op) {
case '-':
+ /*
+ * Unfortunately, the modifier ".sym-offset"
+ * can confuse things.
+ */
+ if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11))
+ return FIELD_OP_NONE;
+
if (*str == '-')
field_op = FIELD_OP_UNARY_MINUS;
else
@@ -1582,7 +1626,9 @@ static void __destroy_hist_field(struct hist_field *hist_field)
kfree(hist_field->var.name);
kfree(hist_field->name);
- kfree(hist_field->type);
+
+ /* Can likely be a const */
+ kfree_const(hist_field->type);
kfree(hist_field->system);
kfree(hist_field->event_name);
@@ -1639,9 +1685,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
hist_field->size = sizeof(u64);
- hist_field->type = kstrdup("u64", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "u64";
goto out;
}
@@ -1650,12 +1694,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
- if (flags & HIST_FIELD_FL_LOG2) {
- unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
- hist_field->fn = hist_field_log2;
+ if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
+ unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
+ hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
+ hist_field_bucket;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
- hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
goto out;
@@ -1664,29 +1709,27 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_TIMESTAMP) {
hist_field->fn = hist_field_timestamp;
hist_field->size = sizeof(u64);
- hist_field->type = kstrdup("u64", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
hist_field->fn = hist_field_cpu;
hist_field->size = sizeof(int);
- hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "unsigned int";
goto out;
}
if (WARN_ON_ONCE(!field))
goto out;
- if (is_string_field(field)) {
+ /* Pointers to strings are just pointers and dangerous to dereference */
+ if (is_string_field(field) &&
+ (field->filter_type != FILTER_PTR_STRING)) {
flags |= HIST_FIELD_FL_STRING;
hist_field->size = MAX_FILTER_STR_VAL;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -1699,7 +1742,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -1785,7 +1828,7 @@ static int init_var_ref(struct hist_field *ref_field,
}
}
- ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL);
if (!ref_field->type) {
err = -ENOMEM;
goto free;
@@ -1943,7 +1986,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
static struct ftrace_event_field *
parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
- char *field_str, unsigned long *flags)
+ char *field_str, unsigned long *flags, unsigned long *buckets)
{
struct ftrace_event_field *field = NULL;
char *field_name, *modifier, *str;
@@ -1970,7 +2013,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
- else {
+ else if (strncmp(modifier, "bucket", 6) == 0) {
+ int ret;
+
+ modifier += 6;
+
+ if (*modifier == 's')
+ modifier++;
+ if (*modifier != '=')
+ goto error;
+ modifier++;
+ ret = kstrtoul(modifier, 0, buckets);
+ if (ret || !(*buckets))
+ goto error;
+ *flags |= HIST_FIELD_FL_BUCKET;
+ } else {
+ error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
field = ERR_PTR(-EINVAL);
goto out;
@@ -1982,14 +2040,24 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
- } else if (strcmp(field_name, "cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
- field = ERR_PTR(-EINVAL);
- goto out;
+ /*
+ * For backward compatibility, if field_name
+ * was "cpu", then we treat this the same as
+ * common_cpu.
+ */
+ if (strcmp(field_name, "cpu") == 0) {
+ *flags |= HIST_FIELD_FL_CPU;
+ } else {
+ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
+ errpos(field_name));
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
}
}
out:
@@ -2029,6 +2097,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
struct ftrace_event_field *field = NULL;
struct hist_field *hist_field = NULL;
+ unsigned long buckets = 0;
int ret = 0;
s = strchr(str, '.');
@@ -2066,7 +2135,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
} else
str = s;
- field = parse_field(hist_data, file, str, flags);
+ field = parse_field(hist_data, file, str, flags, &buckets);
if (IS_ERR(field)) {
ret = PTR_ERR(field);
goto out;
@@ -2077,6 +2146,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
ret = -ENOMEM;
goto out;
}
+ hist_field->buckets = buckets;
return hist_field;
out:
@@ -2137,6 +2207,13 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
ret = PTR_ERR(operand1);
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ /* String type can not be the operand of unary operator. */
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ destroy_hist_field(operand1, 0);
+ ret = -EINVAL;
+ goto free;
+ }
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -2144,7 +2221,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operator = FIELD_OP_UNARY_MINUS;
expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
if (!expr->type) {
ret = -ENOMEM;
goto free;
@@ -2238,6 +2315,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand1 = NULL;
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
+ ret = -EINVAL;
+ goto free;
+ }
/* rest of string could be another expression e.g. b+c in a+b+c */
operand_flags = 0;
@@ -2247,6 +2329,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand2 = NULL;
goto free;
}
+ if (operand2->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
ret = check_expr_operands(file->tr, operand1, operand2);
if (ret)
@@ -2268,9 +2355,13 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operands[1] = operand2;
+
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+
expr->operator = field_op;
expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
if (!expr->type) {
ret = -ENOMEM;
goto free;
@@ -2415,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* If a user specifies a field on an event that isn't the event the
* histogram currently being defined (the target event histogram), the
* only way that can be accomplished is if a new hist trigger is
@@ -2434,12 +2525,12 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
char *subsys_name, char *event_name, char *field_name)
{
struct trace_array *tr = target_hist_data->event_file->tr;
- struct hist_field *event_var = ERR_PTR(-EINVAL);
struct hist_trigger_data *hist_data;
unsigned int i, n, first = true;
struct field_var_hist *var_hist;
struct trace_event_file *file;
struct hist_field *key_field;
+ struct hist_field *event_var;
char *saved_filter;
char *cmd;
int ret;
@@ -2658,10 +2749,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
var->var.hist_data = var->hist_data = hist_data;
var->size = size;
var->var.name = kstrdup(name, GFP_KERNEL);
- var->type = kstrdup(type, GFP_KERNEL);
+ var->type = kstrdup_const(type, GFP_KERNEL);
if (!var->var.name || !var->type) {
+ kfree_const(var->type);
kfree(var->var.name);
- kfree(var->type);
kfree(var);
var = ERR_PTR(-ENOMEM);
}
@@ -3389,6 +3480,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
@@ -3688,6 +3781,41 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
+static const char *no_comm = "(no comm)";
+
+static u64 hist_field_execname(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+
+ if (WARN_ON_ONCE(!elt))
+ return (u64)(unsigned long)no_comm;
+
+ elt_data = elt->private_data;
+
+ if (WARN_ON_ONCE(!elt_data->comm))
+ return (u64)(unsigned long)no_comm;
+
+ return (u64)(unsigned long)(elt_data->comm);
+}
+
+/* Convert a var that points to common_pid.execname to a string */
+static void update_var_execname(struct hist_field *hist_field)
+{
+ hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR |
+ HIST_FIELD_FL_EXECNAME;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->is_signed = 0;
+
+ kfree_const(hist_field->type);
+ hist_field->type = "char[]";
+
+ hist_field->fn = hist_field_execname;
+}
+
static int create_var_field(struct hist_trigger_data *hist_data,
unsigned int val_idx,
struct trace_event_file *file,
@@ -3712,6 +3840,9 @@ static int create_var_field(struct hist_trigger_data *hist_data,
ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+ if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
+ update_var_execname(hist_data->fields[val_idx]);
+
if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
@@ -4488,8 +4619,6 @@ static inline void add_to_key(char *compound_key, void *key,
field = key_field->field;
if (field->filter_type == FILTER_DYN_STRING)
size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_PTR_STRING)
- size = strlen(key);
else if (field->filter_type == FILTER_STATIC_STRING)
size = field->size;
@@ -4657,6 +4786,11 @@ static void hist_trigger_print_key(struct seq_file *m,
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
+ } else if (key_field->flags & HIST_FIELD_FL_BUCKET) {
+ unsigned long buckets = key_field->buckets;
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: ~ %llu-%llu", field_name,
+ uval, uval + buckets -1);
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
@@ -5078,7 +5212,7 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, "%s=", hist_field->var.name);
if (hist_field->flags & HIST_FIELD_FL_CPU)
- seq_puts(m, "cpu");
+ seq_puts(m, "common_cpu");
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
@@ -5096,6 +5230,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, ".%s", flags);
}
}
+ if (hist_field->buckets)
+ seq_printf(m, "=%ld", hist_field->buckets);
}
static int event_hist_trigger_print(struct seq_file *m,
@@ -5232,6 +5368,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
cmd = hist_data->field_var_hists[i]->cmd;
ret = event_hist_trigger_func(&trigger_hist_cmd, file,
"!hist", "hist", cmd);
+ WARN_ON_ONCE(ret < 0);
}
}
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 2ac75eb6aa86..d54094b7a9d7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -893,15 +893,13 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
dyn_event_init(&event->devent, &synth_event_ops);
for (i = 0, j = 0; i < n_fields; i++) {
+ fields[i]->field_pos = i;
event->fields[i] = fields[i];
- if (fields[i]->is_dynamic) {
- event->dynamic_fields[j] = fields[i];
- event->dynamic_fields[j]->field_pos = i;
+ if (fields[i]->is_dynamic)
event->dynamic_fields[j++] = fields[i];
- event->n_dynamic_fields++;
- }
}
+ event->n_dynamic_fields = j;
event->n_fields = n_fields;
out:
return event;
@@ -1300,7 +1298,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
}
ret = register_synth_event(event);
if (!ret)
- dyn_event_add(&event->devent);
+ dyn_event_add(&event->devent, &event->call);
else
free_synth_event(event);
out:
@@ -1371,13 +1369,15 @@ static int destroy_synth_event(struct synth_event *se)
int ret;
if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
+ return -EBUSY;
+
+ if (trace_event_dyn_busy(&se->call))
+ return -EBUSY;
+
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
}
return ret;
@@ -2104,6 +2104,9 @@ static int synth_event_release(struct dyn_event *ev)
if (event->ref)
return -EBUSY;
+ if (trace_event_dyn_busy(&event->call))
+ return -EBUSY;
+
ret = unregister_synth_event(event);
if (ret)
return ret;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b8bfa8505b7b..3d5c07239a2a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
return seq_list_next(t, &event_file->triggers, pos);
}
+static bool check_user_trigger(struct trace_event_file *file)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->flags & EVENT_TRIGGER_FL_PROBE)
+ continue;
+ return true;
+ }
+ return false;
+}
+
static void *trigger_start(struct seq_file *m, loff_t *pos)
{
struct trace_event_file *event_file;
@@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
- if (list_empty(&event_file->triggers))
+ if (list_empty(&event_file->triggers) || !check_user_trigger(event_file))
return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
return seq_list_start(&event_file->triggers, *pos);
@@ -916,7 +928,8 @@ void unpause_named_trigger(struct event_trigger_data *data)
/**
* set_named_trigger_data - Associate common named trigger data
- * @data: The trigger data of a named trigger to unpause
+ * @data: The trigger data to associate
+ * @named_data: The common named trigger to be associated
*
* Named triggers are sets of triggers that share a common set of
* trigger data. The first named trigger registered with a given name
@@ -1333,7 +1346,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops,
if (!data->ref) {
/* Remove the SOFT_MODE flag */
trace_event_enable_disable(enable_data->file, 0, 1);
- module_put(enable_data->file->event_call->mod);
+ trace_event_put_ref(enable_data->file->event_call);
trigger_data_free(data);
kfree(enable_data);
}
@@ -1480,7 +1493,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(event_enable_file->event_call->mod);
+ ret = trace_event_try_get_ref(event_enable_file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -1509,7 +1522,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
- module_put(event_enable_file->event_call->mod);
+ trace_event_put_ref(event_enable_file->event_call);
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 632ef88131a9..1b83d75eb103 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -34,7 +34,7 @@
* Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
* Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
*
- * Includes useful feedback from Clark Williams <clark@redhat.com>
+ * Includes useful feedback from Clark Williams <williams@redhat.com>
*
*/
#include <linux/kthread.h>
@@ -54,20 +54,33 @@ static struct trace_array *hwlat_trace;
#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
-/* sampling thread*/
-static struct task_struct *hwlat_kthread;
-
static struct dentry *hwlat_sample_width; /* sample width us */
static struct dentry *hwlat_sample_window; /* sample window us */
+static struct dentry *hwlat_thread_mode; /* hwlat thread mode */
+
+enum {
+ MODE_NONE = 0,
+ MODE_ROUND_ROBIN,
+ MODE_PER_CPU,
+ MODE_MAX
+};
+static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" };
/* Save the previous tracing_thresh value */
static unsigned long save_tracing_thresh;
-/* NMI timestamp counters */
-static u64 nmi_ts_start;
-static u64 nmi_total_ts;
-static int nmi_count;
-static int nmi_cpu;
+/* runtime kthread data */
+struct hwlat_kthread_data {
+ struct task_struct *kthread;
+ /* NMI timestamp counters */
+ u64 nmi_ts_start;
+ u64 nmi_total_ts;
+ int nmi_count;
+ int nmi_cpu;
+};
+
+struct hwlat_kthread_data hwlat_single_cpu_data;
+DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);
/* Tells NMIs to call back to the hwlat tracer to record timestamps */
bool trace_hwlat_callback_enabled;
@@ -96,11 +109,24 @@ static struct hwlat_data {
u64 sample_window; /* total sampling window (on+off) */
u64 sample_width; /* active sampling portion of window */
+ int thread_mode; /* thread mode */
+
} hwlat_data = {
.sample_window = DEFAULT_SAMPLE_WINDOW,
.sample_width = DEFAULT_SAMPLE_WIDTH,
+ .thread_mode = MODE_ROUND_ROBIN
};
+static struct hwlat_kthread_data *get_cpu_data(void)
+{
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ return this_cpu_ptr(&hwlat_per_cpu_data);
+ else
+ return &hwlat_single_cpu_data;
+}
+
+static bool hwlat_busy;
+
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
@@ -136,7 +162,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
void trace_hwlat_callback(bool enter)
{
- if (smp_processor_id() != nmi_cpu)
+ struct hwlat_kthread_data *kdata = get_cpu_data();
+
+ if (!kdata->kthread)
return;
/*
@@ -145,15 +173,24 @@ void trace_hwlat_callback(bool enter)
*/
if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
if (enter)
- nmi_ts_start = time_get();
+ kdata->nmi_ts_start = time_get();
else
- nmi_total_ts += time_get() - nmi_ts_start;
+ kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start;
}
if (enter)
- nmi_count++;
+ kdata->nmi_count++;
}
+/*
+ * hwlat_err - report a hwlat error.
+ */
+#define hwlat_err(msg) ({ \
+ struct trace_array *tr = hwlat_trace; \
+ \
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \
+})
+
/**
* get_sample - sample the CPU TSC and look for likely hardware latencies
*
@@ -163,6 +200,7 @@ void trace_hwlat_callback(bool enter)
*/
static int get_sample(void)
{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
struct trace_array *tr = hwlat_trace;
struct hwlat_sample s;
time_type start, t1, t2, last_t2;
@@ -175,9 +213,8 @@ static int get_sample(void)
do_div(thresh, NSEC_PER_USEC); /* modifies interval value */
- nmi_cpu = smp_processor_id();
- nmi_total_ts = 0;
- nmi_count = 0;
+ kdata->nmi_total_ts = 0;
+ kdata->nmi_count = 0;
/* Make sure NMIs see this first */
barrier();
@@ -197,7 +234,7 @@ static int get_sample(void)
outer_diff = time_to_us(time_sub(t1, last_t2));
/* This shouldn't happen */
if (outer_diff < 0) {
- pr_err(BANNER "time running backwards\n");
+ hwlat_err(BANNER "time running backwards\n");
goto out;
}
if (outer_diff > outer_sample)
@@ -209,7 +246,7 @@ static int get_sample(void)
/* Check for possible overflows */
if (total < last_total) {
- pr_err("Time total overflowed\n");
+ hwlat_err("Time total overflowed\n");
break;
}
last_total = total;
@@ -225,7 +262,7 @@ static int get_sample(void)
/* This shouldn't happen */
if (diff < 0) {
- pr_err(BANNER "time running backwards\n");
+ hwlat_err(BANNER "time running backwards\n");
goto out;
}
@@ -247,15 +284,15 @@ static int get_sample(void)
ret = 1;
/* We read in microseconds */
- if (nmi_total_ts)
- do_div(nmi_total_ts, NSEC_PER_USEC);
+ if (kdata->nmi_total_ts)
+ do_div(kdata->nmi_total_ts, NSEC_PER_USEC);
hwlat_data.count++;
s.seqnum = hwlat_data.count;
s.duration = sample;
s.outer_duration = outer_sample;
- s.nmi_total_ts = nmi_total_ts;
- s.nmi_count = nmi_count;
+ s.nmi_total_ts = kdata->nmi_total_ts;
+ s.nmi_count = kdata->nmi_count;
s.count = count;
trace_hwlat_sample(&s);
@@ -273,7 +310,6 @@ out:
}
static struct cpumask save_cpumask;
-static bool disable_migrate;
static void move_to_next_cpu(void)
{
@@ -281,26 +317,24 @@ static void move_to_next_cpu(void)
struct trace_array *tr = hwlat_trace;
int next_cpu;
- if (disable_migrate)
- return;
/*
* If for some reason the user modifies the CPU affinity
* of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, current->cpus_ptr))
- goto disable;
+ goto change_mode;
- get_online_cpus();
+ cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(smp_processor_id(), current_mask);
- put_online_cpus();
+ next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ cpus_read_unlock();
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
- goto disable;
+ goto change_mode;
cpumask_clear(current_mask);
cpumask_set_cpu(next_cpu, current_mask);
@@ -308,8 +342,9 @@ static void move_to_next_cpu(void)
sched_setaffinity(0, current_mask);
return;
- disable:
- disable_migrate = true;
+ change_mode:
+ hwlat_data.thread_mode = MODE_NONE;
+ pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n");
}
/*
@@ -328,7 +363,8 @@ static int kthread_fn(void *data)
while (!kthread_should_stop()) {
- move_to_next_cpu();
+ if (hwlat_data.thread_mode == MODE_ROUND_ROBIN)
+ move_to_next_cpu();
local_irq_disable();
get_sample();
@@ -351,178 +387,380 @@ static int kthread_fn(void *data)
return 0;
}
-/**
- * start_kthread - Kick off the hardware latency sampling/detector kthread
+/*
+ * stop_stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_single_kthread(void)
+{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
+ struct task_struct *kthread;
+
+ cpus_read_lock();
+ kthread = kdata->kthread;
+
+ if (!kthread)
+ goto out_put_cpus;
+
+ kthread_stop(kthread);
+ kdata->kthread = NULL;
+
+out_put_cpus:
+ cpus_read_unlock();
+}
+
+
+/*
+ * start_single_kthread - Kick off the hardware latency sampling/detector kthread
*
* This starts the kernel thread that will sit and sample the CPU timestamp
* counter (TSC or similar) and look for potential hardware latencies.
*/
-static int start_kthread(struct trace_array *tr)
+static int start_single_kthread(struct trace_array *tr)
{
+ struct hwlat_kthread_data *kdata = get_cpu_data();
struct cpumask *current_mask = &save_cpumask;
struct task_struct *kthread;
int next_cpu;
- if (hwlat_kthread)
- return 0;
-
- /* Just pick the first CPU on first iteration */
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- put_online_cpus();
- next_cpu = cpumask_first(current_mask);
+ cpus_read_lock();
+ if (kdata->kthread)
+ goto out_put_cpus;
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
+ cpus_read_unlock();
return -ENOMEM;
}
- cpumask_clear(current_mask);
- cpumask_set_cpu(next_cpu, current_mask);
+ /* Just pick the first CPU on first iteration */
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+ if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) {
+ next_cpu = cpumask_first(current_mask);
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+
+ }
+
sched_setaffinity(kthread->pid, current_mask);
- hwlat_kthread = kthread;
+ kdata->kthread = kthread;
wake_up_process(kthread);
+out_put_cpus:
+ cpus_read_unlock();
return 0;
}
-/**
- * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+/*
+ * stop_cpu_kthread - Stop a hwlat cpu kthread
+ */
+static void stop_cpu_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+
+ kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread;
+ if (kthread)
+ kthread_stop(kthread);
+ per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
+}
+
+/*
+ * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthread to stop
*
- * This kicks the running hardware latency sampling/detector kernel thread and
+ * This kicks the running hardware latency sampling/detector kernel threads and
* tells it to stop sampling now. Use this on unload and at system shutdown.
*/
-static void stop_kthread(void)
+static void stop_per_cpu_kthreads(void)
{
- if (!hwlat_kthread)
- return;
- kthread_stop(hwlat_kthread);
- hwlat_kthread = NULL;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ stop_cpu_kthread(cpu);
+ cpus_read_unlock();
}
/*
- * hwlat_read - Wrapper read function for reading both window and width
- * @filp: The active open file structure
- * @ubuf: The userspace provided buffer to read value into
- * @cnt: The maximum number of bytes to read
- * @ppos: The current "file" position
- *
- * This function provides a generic read implementation for the global state
- * "hwlat_data" structure filesystem entries.
+ * start_cpu_kthread - Start a hwlat cpu kthread
*/
-static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int start_cpu_kthread(unsigned int cpu)
{
- char buf[U64STR_SIZE];
- u64 *entry = filp->private_data;
- u64 val;
- int len;
+ struct task_struct *kthread;
+ char comm[24];
- if (!entry)
- return -EFAULT;
+ snprintf(comm, 24, "hwlatd/%d", cpu);
- if (cnt > sizeof(buf))
- cnt = sizeof(buf);
+ kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm);
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ return -ENOMEM;
+ }
- val = *entry;
+ per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;
+ wake_up_process(kthread);
- len = snprintf(buf, sizeof(buf), "%llu\n", val);
+ return 0;
+}
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+#ifdef CONFIG_HOTPLUG_CPU
+static void hwlat_hotplug_workfn(struct work_struct *dummy)
+{
+ struct trace_array *tr = hwlat_trace;
+ unsigned int cpu = smp_processor_id();
+
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&hwlat_data.lock);
+ cpus_read_lock();
+
+ if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
+ goto out_unlock;
+
+ if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
+ goto out_unlock;
+
+ start_cpu_kthread(cpu);
+
+out_unlock:
+ cpus_read_unlock();
+ mutex_unlock(&hwlat_data.lock);
+ mutex_unlock(&trace_types_lock);
}
-/**
- * hwlat_width_write - Write function for "width" entry
- * @filp: The active open file structure
- * @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
- * @ppos: The current position in @file
+static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn);
+
+/*
+ * hwlat_cpu_init - CPU hotplug online callback function
+ */
+static int hwlat_cpu_init(unsigned int cpu)
+{
+ schedule_work_on(cpu, &hwlat_hotplug_work);
+ return 0;
+}
+
+/*
+ * hwlat_cpu_die - CPU hotplug offline callback function
+ */
+static int hwlat_cpu_die(unsigned int cpu)
+{
+ stop_cpu_kthread(cpu);
+ return 0;
+}
+
+static void hwlat_init_hotplug_support(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online",
+ hwlat_cpu_init, hwlat_cpu_die);
+ if (ret < 0)
+ pr_warn(BANNER "Error to init cpu hotplug support\n");
+
+ return;
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void hwlat_init_hotplug_support(void)
+{
+ return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads
*
- * This function provides a write implementation for the "width" interface
- * to the hardware latency detector. It can be used to configure
- * for how many us of the total window us we will actively sample for any
- * hardware-induced latency periods. Obviously, it is not possible to
- * sample constantly and have the system respond to a sample reader, or,
- * worse, without having the system appear to have gone out to lunch. It
- * is enforced that width is less that the total window size.
+ * This starts the kernel threads that will sit on potentially all cpus and
+ * sample the CPU timestamp counter (TSC or similar) and look for potential
+ * hardware latencies.
*/
-static ssize_t
-hwlat_width_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int start_per_cpu_kthreads(struct trace_array *tr)
{
- u64 val;
- int err;
+ struct cpumask *current_mask = &save_cpumask;
+ unsigned int cpu;
+ int retval;
- err = kstrtoull_from_user(ubuf, cnt, 10, &val);
- if (err)
- return err;
+ cpus_read_lock();
+ /*
+ * Run only on CPUs in which hwlat is allowed to run.
+ */
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+ for_each_online_cpu(cpu)
+ per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
+
+ for_each_cpu(cpu, current_mask) {
+ retval = start_cpu_kthread(cpu);
+ if (retval)
+ goto out_error;
+ }
+ cpus_read_unlock();
+
+ return 0;
+
+out_error:
+ cpus_read_unlock();
+ stop_per_cpu_kthreads();
+ return retval;
+}
+
+static void *s_mode_start(struct seq_file *s, loff_t *pos)
+{
+ int mode = *pos;
mutex_lock(&hwlat_data.lock);
- if (val < hwlat_data.sample_window)
- hwlat_data.sample_width = val;
+
+ if (mode >= MODE_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ int mode = ++(*pos);
+
+ if (mode >= MODE_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static int s_mode_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ int mode = *pos;
+
+ if (mode == hwlat_data.thread_mode)
+ seq_printf(s, "[%s]", thread_mode_str[mode]);
else
- err = -EINVAL;
- mutex_unlock(&hwlat_data.lock);
+ seq_printf(s, "%s", thread_mode_str[mode]);
- if (err)
- return err;
+ if (mode != MODE_MAX)
+ seq_puts(s, " ");
+
+ return 0;
+}
- return cnt;
+static void s_mode_stop(struct seq_file *s, void *v)
+{
+ seq_puts(s, "\n");
+ mutex_unlock(&hwlat_data.lock);
}
+static const struct seq_operations thread_mode_seq_ops = {
+ .start = s_mode_start,
+ .next = s_mode_next,
+ .show = s_mode_show,
+ .stop = s_mode_stop
+};
+
+static int hwlat_mode_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &thread_mode_seq_ops);
+};
+
+static void hwlat_tracer_start(struct trace_array *tr);
+static void hwlat_tracer_stop(struct trace_array *tr);
+
/**
- * hwlat_window_write - Write function for "window" entry
+ * hwlat_mode_write - Write function for "mode" entry
* @filp: The active open file structure
* @ubuf: The user buffer that contains the value to write
* @cnt: The maximum number of bytes to write to "file"
* @ppos: The current position in @file
*
- * This function provides a write implementation for the "window" interface
- * to the hardware latency detector. The window is the total time
- * in us that will be considered one sample period. Conceptually, windows
- * occur back-to-back and contain a sample width period during which
- * actual sampling occurs. Can be used to write a new total window size. It
- * is enforced that any value written must be greater than the sample width
- * size, or an error results.
+ * This function provides a write implementation for the "mode" interface
+ * to the hardware latency detector. hwlatd has different operation modes.
+ * The "none" sets the allowed cpumask for a single hwlatd thread at the
+ * startup and lets the scheduler handle the migration. The default mode is
+ * the "round-robin" one, in which a single hwlatd thread runs, migrating
+ * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode
+ * creates one hwlatd thread per allowed CPU.
*/
-static ssize_t
-hwlat_window_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- u64 val;
- int err;
+ struct trace_array *tr = hwlat_trace;
+ const char *mode;
+ char buf[64];
+ int ret, i;
- err = kstrtoull_from_user(ubuf, cnt, 10, &val);
- if (err)
- return err;
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ mode = strstrip(buf);
+
+ ret = -EINVAL;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop
+ * and hwlat_busy.
+ */
+ mutex_lock(&trace_types_lock);
+ if (hwlat_busy)
+ hwlat_tracer_stop(tr);
mutex_lock(&hwlat_data.lock);
- if (hwlat_data.sample_width < val)
- hwlat_data.sample_window = val;
- else
- err = -EINVAL;
+
+ for (i = 0; i < MODE_MAX; i++) {
+ if (strcmp(mode, thread_mode_str[i]) == 0) {
+ hwlat_data.thread_mode = i;
+ ret = cnt;
+ }
+ }
+
mutex_unlock(&hwlat_data.lock);
- if (err)
- return err;
+ if (hwlat_busy)
+ hwlat_tracer_start(tr);
+ mutex_unlock(&trace_types_lock);
- return cnt;
+ *ppos += cnt;
+
+
+
+ return ret;
}
-static const struct file_operations width_fops = {
- .open = tracing_open_generic,
- .read = hwlat_read,
- .write = hwlat_width_write,
+/*
+ * The width parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is upper
+ * bounded by the window parameter.
+ */
+static struct trace_min_max_param hwlat_width = {
+ .lock = &hwlat_data.lock,
+ .val = &hwlat_data.sample_width,
+ .max = &hwlat_data.sample_window,
+ .min = NULL,
};
-static const struct file_operations window_fops = {
- .open = tracing_open_generic,
- .read = hwlat_read,
- .write = hwlat_window_write,
+/*
+ * The window parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is lower
+ * bounded by the width parameter.
+ */
+static struct trace_min_max_param hwlat_window = {
+ .lock = &hwlat_data.lock,
+ .val = &hwlat_data.sample_window,
+ .max = NULL,
+ .min = &hwlat_data.sample_width,
};
+static const struct file_operations thread_mode_fops = {
+ .open = hwlat_mode_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = hwlat_mode_write
+};
/**
* init_tracefs - A function to initialize the tracefs interface files
*
@@ -546,18 +784,25 @@ static int init_tracefs(void)
hwlat_sample_window = tracefs_create_file("window", 0640,
top_dir,
- &hwlat_data.sample_window,
- &window_fops);
+ &hwlat_window,
+ &trace_min_max_fops);
if (!hwlat_sample_window)
goto err;
hwlat_sample_width = tracefs_create_file("width", 0644,
top_dir,
- &hwlat_data.sample_width,
- &width_fops);
+ &hwlat_width,
+ &trace_min_max_fops);
if (!hwlat_sample_width)
goto err;
+ hwlat_thread_mode = trace_create_file("mode", 0644,
+ top_dir,
+ NULL,
+ &thread_mode_fops);
+ if (!hwlat_thread_mode)
+ goto err;
+
return 0;
err:
@@ -569,18 +814,22 @@ static void hwlat_tracer_start(struct trace_array *tr)
{
int err;
- err = start_kthread(tr);
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ err = start_per_cpu_kthreads(tr);
+ else
+ err = start_single_kthread(tr);
if (err)
pr_err(BANNER "Cannot start hwlat kthread\n");
}
static void hwlat_tracer_stop(struct trace_array *tr)
{
- stop_kthread();
+ if (hwlat_data.thread_mode == MODE_PER_CPU)
+ stop_per_cpu_kthreads();
+ else
+ stop_single_kthread();
}
-static bool hwlat_busy;
-
static int hwlat_tracer_init(struct trace_array *tr)
{
/* Only allow one instance to enable this */
@@ -589,7 +838,6 @@ static int hwlat_tracer_init(struct trace_array *tr)
hwlat_trace = tr;
- disable_migrate = false;
hwlat_data.count = 0;
tr->max_latency = 0;
save_tracing_thresh = tracing_thresh;
@@ -608,7 +856,7 @@ static int hwlat_tracer_init(struct trace_array *tr)
static void hwlat_tracer_reset(struct trace_array *tr)
{
- stop_kthread();
+ hwlat_tracer_stop(tr);
/* the tracing threshold is static between runs */
last_tracing_thresh = tracing_thresh;
@@ -637,6 +885,8 @@ __init static int init_hwlat_tracer(void)
if (ret)
return ret;
+ hwlat_init_hotplug_support();
+
init_tracefs();
return 0;
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 9da76104f7a2..59857a1ee44c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -147,11 +147,17 @@ static int kdb_ftdump(int argc, const char **argv)
return 0;
}
+static kdbtab_t ftdump_cmd = {
+ .name = "ftdump",
+ .func = kdb_ftdump,
+ .usage = "[skip_#entries] [cpu]",
+ .help = "Dump ftrace log; -skip dumps last #entries",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
static __init int kdb_ftrace_register(void)
{
- kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
- "Dump ftrace log; -skip dumps last #entries", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register(&ftdump_cmd);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ea6178cb5e33..3a64ba4bbad6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -80,10 +80,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
-#define SIZEOF_TRACE_KPROBE(n) \
- (offsetof(struct trace_kprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
@@ -265,7 +261,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
struct trace_kprobe *tk;
int ret = -ENOMEM;
- tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
+ tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
if (!tk)
return ERR_PTR(ret);
@@ -543,6 +539,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
if (trace_probe_is_enabled(&tk->tp))
return -EBUSY;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp)))
+ return -EBUSY;
+
/* Will fail if probe is being used by ftrace or perf */
if (unregister_kprobe_event(tk))
return -EBUSY;
@@ -618,7 +618,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
if (ret)
trace_probe_unlink(&tk->tp);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
return ret;
}
@@ -647,7 +647,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register new event */
ret = register_kprobe_event(tk);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
@@ -661,7 +665,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (ret < 0)
unregister_kprobe_event(tk);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
end:
mutex_unlock(&event_mutex);
@@ -703,14 +707,6 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
-/* Convert certain expected symbols into '_' when generating event names */
-static inline void sanitize_event_name(char *name)
-{
- while (*name++ != '\0')
- if (*name == ':' || *name == '.')
- *name = '_';
-}
-
static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
@@ -742,6 +738,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
bool is_return = false;
char *symbol = NULL, *tmp = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ enum probe_print_type ptype;
int maxactive = 0;
long offset = 0;
void *addr = NULL;
@@ -869,20 +866,14 @@ static int __trace_kprobe_create(int argc, const char *argv[])
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
- kfree(tmp);
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags);
if (ret)
goto error; /* This can be -ENOMEM */
}
- ret = traceprobe_set_print_fmt(&tk->tp, is_return);
+ ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
goto error;
@@ -1330,9 +1321,10 @@ probe_mem_read(void *dest, void *src, size_t size)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
retry:
@@ -1806,6 +1798,7 @@ struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
+ enum probe_print_type ptype;
struct trace_kprobe *tk;
int ret;
char *event;
@@ -1829,7 +1822,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
init_trace_event_call(tk);
- if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ ptype = trace_kprobe_is_return(tk) ?
+ PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
new file mode 100644
index 000000000000..ce053619f289
--- /dev/null
+++ b/kernel/trace/trace_osnoise.c
@@ -0,0 +1,2113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OS Noise Tracer: computes the OS Noise suffered by a running thread.
+ * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread.
+ *
+ * Based on "hwlat_detector" tracer by:
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ * With feedback from Clark Williams <williams@redhat.com>
+ *
+ * And also based on the rtsl tracer presented on:
+ * DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux
+ * scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems
+ * (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020.
+ *
+ * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. <bristot@redhat.com>
+ */
+
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched.h>
+#include "trace.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/osnoise.h>
+
+static struct trace_array *osnoise_trace;
+
+/*
+ * Default values.
+ */
+#define BANNER "osnoise: "
+#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */
+#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */
+
+#define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */
+#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
+
+/*
+ * NMI runtime info.
+ */
+struct osn_nmi {
+ u64 count;
+ u64 delta_start;
+};
+
+/*
+ * IRQ runtime info.
+ */
+struct osn_irq {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+#define IRQ_CONTEXT 0
+#define THREAD_CONTEXT 1
+/*
+ * sofirq runtime info.
+ */
+struct osn_softirq {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+/*
+ * thread runtime info.
+ */
+struct osn_thread {
+ u64 count;
+ u64 arrival_time;
+ u64 delta_start;
+};
+
+/*
+ * Runtime information: this structure saves the runtime information used by
+ * one sampling thread.
+ */
+struct osnoise_variables {
+ struct task_struct *kthread;
+ bool sampling;
+ pid_t pid;
+ struct osn_nmi nmi;
+ struct osn_irq irq;
+ struct osn_softirq softirq;
+ struct osn_thread thread;
+ local_t int_counter;
+};
+
+/*
+ * Per-cpu runtime information.
+ */
+DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
+
+/*
+ * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU
+ */
+static inline struct osnoise_variables *this_cpu_osn_var(void)
+{
+ return this_cpu_ptr(&per_cpu_osnoise_var);
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * Runtime information for the timer mode.
+ */
+struct timerlat_variables {
+ struct task_struct *kthread;
+ struct hrtimer timer;
+ u64 rel_period;
+ u64 abs_period;
+ bool tracing_thread;
+ u64 count;
+};
+
+DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
+
+/*
+ * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
+ */
+static inline struct timerlat_variables *this_cpu_tmr_var(void)
+{
+ return this_cpu_ptr(&per_cpu_timerlat_var);
+}
+
+/*
+ * tlat_var_reset - Reset the values of the given timerlat_variables
+ */
+static inline void tlat_var_reset(void)
+{
+ struct timerlat_variables *tlat_var;
+ int cpu;
+ /*
+ * So far, all the values are initialized as 0, so
+ * zeroing the structure is perfect.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ memset(tlat_var, 0, sizeof(*tlat_var));
+ }
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+#define tlat_var_reset() do {} while (0)
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * osn_var_reset - Reset the values of the given osnoise_variables
+ */
+static inline void osn_var_reset(void)
+{
+ struct osnoise_variables *osn_var;
+ int cpu;
+
+ /*
+ * So far, all the values are initialized as 0, so
+ * zeroing the structure is perfect.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+ memset(osn_var, 0, sizeof(*osn_var));
+ }
+}
+
+/*
+ * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables
+ */
+static inline void osn_var_reset_all(void)
+{
+ osn_var_reset();
+ tlat_var_reset();
+}
+
+/*
+ * Tells NMIs to call back to the osnoise tracer to record timestamps.
+ */
+bool trace_osnoise_callback_enabled;
+
+/*
+ * osnoise sample structure definition. Used to store the statistics of a
+ * sample run.
+ */
+struct osnoise_sample {
+ u64 runtime; /* runtime */
+ u64 noise; /* noise */
+ u64 max_sample; /* max single noise sample */
+ int hw_count; /* # HW (incl. hypervisor) interference */
+ int nmi_count; /* # NMIs during this sample */
+ int irq_count; /* # IRQs during this sample */
+ int softirq_count; /* # softirqs during this sample */
+ int thread_count; /* # threads during this sample */
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat sample structure definition. Used to store the statistics of
+ * a sample run.
+ */
+struct timerlat_sample {
+ u64 timer_latency; /* timer_latency */
+ unsigned int seqnum; /* unique sequence */
+ int context; /* timer context */
+};
+#endif
+
+/*
+ * Protect the interface.
+ */
+struct mutex interface_lock;
+
+/*
+ * Tracer data.
+ */
+static struct osnoise_data {
+ u64 sample_period; /* total sampling period */
+ u64 sample_runtime; /* active sampling portion of period */
+ u64 stop_tracing; /* stop trace in the internal operation (loop/irq) */
+ u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */
+#ifdef CONFIG_TIMERLAT_TRACER
+ u64 timerlat_period; /* timerlat period */
+ u64 print_stack; /* print IRQ stack if total > */
+ int timerlat_tracer; /* timerlat tracer */
+#endif
+ bool tainted; /* infor users and developers about a problem */
+} osnoise_data = {
+ .sample_period = DEFAULT_SAMPLE_PERIOD,
+ .sample_runtime = DEFAULT_SAMPLE_RUNTIME,
+ .stop_tracing = 0,
+ .stop_tracing_total = 0,
+#ifdef CONFIG_TIMERLAT_TRACER
+ .print_stack = 0,
+ .timerlat_period = DEFAULT_TIMERLAT_PERIOD,
+ .timerlat_tracer = 0,
+#endif
+};
+
+/*
+ * Boolean variable used to inform that the tracer is currently sampling.
+ */
+static bool osnoise_busy;
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Print the osnoise header info.
+ */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+
+ seq_puts(s, "# |||||| / ");
+ seq_puts(s, " MAX\n");
+
+ seq_puts(s, "# ||||| / ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# ||||||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
+ seq_puts(s, "# _-----=> irqs-off\n");
+ seq_puts(s, "# / _----=> need-resched\n");
+ seq_puts(s, "# | / _---=> hardirq/softirq\n");
+ seq_puts(s, "# || / _--=> preempt-depth ");
+ seq_puts(s, " MAX\n");
+
+ seq_puts(s, "# || / ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# |||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | |||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
+ * osnoise_taint - report an osnoise error.
+ */
+#define osnoise_taint(msg) ({ \
+ struct trace_array *tr = osnoise_trace; \
+ \
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \
+ osnoise_data.tainted = true; \
+})
+
+/*
+ * Record an osnoise_sample into the tracer buffer.
+ */
+static void trace_osnoise_sample(struct osnoise_sample *sample)
+{
+ struct trace_array *tr = osnoise_trace;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct trace_event_call *call = &event_osnoise;
+ struct ring_buffer_event *event;
+ struct osnoise_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry),
+ tracing_gen_ctx());
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->runtime = sample->runtime;
+ entry->noise = sample->noise;
+ entry->max_sample = sample->max_sample;
+ entry->hw_count = sample->hw_count;
+ entry->nmi_count = sample->nmi_count;
+ entry->irq_count = sample->irq_count;
+ entry->softirq_count = sample->softirq_count;
+ entry->thread_count = sample->thread_count;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * Print the timerlat header info.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||||| /\n");
+ seq_puts(s, "# ||||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-----=> irqs-off\n");
+ seq_puts(s, "# / _----=> need-resched\n");
+ seq_puts(s, "# | / _---=> hardirq/softirq\n");
+ seq_puts(s, "# || / _--=> preempt-depth\n");
+ seq_puts(s, "# || /\n");
+ seq_puts(s, "# |||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | |||| | | ");
+ seq_puts(s, " | |\n");
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
+ * Record an timerlat_sample into the tracer buffer.
+ */
+static void trace_timerlat_sample(struct timerlat_sample *sample)
+{
+ struct trace_array *tr = osnoise_trace;
+ struct trace_event_call *call = &event_osnoise;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct timerlat_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry),
+ tracing_gen_ctx());
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->seqnum = sample->seqnum;
+ entry->context = sample->context;
+ entry->timer_latency = sample->timer_latency;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_CALLS 256
+
+/*
+ * Stack trace will take place only at IRQ level, so, no need
+ * to control nesting here.
+ */
+struct trace_stack {
+ int stack_size;
+ int nr_entries;
+ unsigned long calls[MAX_CALLS];
+};
+
+static DEFINE_PER_CPU(struct trace_stack, trace_stack);
+
+/*
+ * timerlat_save_stack - save a stack trace without printing
+ *
+ * Save the current stack trace without printing. The
+ * stack will be printed later, after the end of the measurement.
+ */
+static void timerlat_save_stack(int skip)
+{
+ unsigned int size, nr_entries;
+ struct trace_stack *fstack;
+
+ fstack = this_cpu_ptr(&trace_stack);
+
+ size = ARRAY_SIZE(fstack->calls);
+
+ nr_entries = stack_trace_save(fstack->calls, size, skip);
+
+ fstack->stack_size = nr_entries * sizeof(unsigned long);
+ fstack->nr_entries = nr_entries;
+
+ return;
+
+}
+/*
+ * timerlat_dump_stack - dump a stack trace previously saved
+ *
+ * Dump a saved stack trace into the trace buffer.
+ */
+static void timerlat_dump_stack(void)
+{
+ struct trace_event_call *call = &event_osnoise;
+ struct trace_array *tr = osnoise_trace;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct trace_stack *fstack;
+ struct stack_entry *entry;
+ unsigned int size;
+
+ preempt_disable_notrace();
+ fstack = this_cpu_ptr(&trace_stack);
+ size = fstack->stack_size;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size,
+ tracing_gen_ctx());
+ if (!event)
+ goto out;
+
+ entry = ring_buffer_event_data(event);
+
+ memcpy(&entry->caller, fstack->calls, size);
+ entry->size = fstack->nr_entries;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+
+out:
+ preempt_enable_notrace();
+}
+#else
+#define timerlat_dump_stack() do {} while (0)
+#define timerlat_save_stack(a) do {} while (0)
+#endif /* CONFIG_STACKTRACE */
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * Macros to encapsulate the time capturing infrastructure.
+ */
+#define time_get() trace_clock_local()
+#define time_to_us(x) div_u64(x, 1000)
+#define time_sub(a, b) ((a) - (b))
+
+/*
+ * cond_move_irq_delta_start - Forward the delta_start of a running IRQ
+ *
+ * If an IRQ is preempted by an NMI, its delta_start is pushed forward
+ * to discount the NMI interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->irq.delta_start)
+ osn_var->irq.delta_start += duration;
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * cond_move_softirq_delta_start - Forward the delta_start of a running softirq.
+ *
+ * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed
+ * forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->softirq.delta_start)
+ osn_var->softirq.delta_start += duration;
+}
+#else /* CONFIG_PREEMPT_RT */
+#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0)
+#endif
+
+/*
+ * cond_move_thread_delta_start - Forward the delta_start of a running thread
+ *
+ * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start
+ * is pushed forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+ if (osn_var->thread.delta_start)
+ osn_var->thread.delta_start += duration;
+}
+
+/*
+ * get_int_safe_duration - Get the duration of a window
+ *
+ * The irq, softirq and thread varaibles need to have its duration without
+ * the interference from higher priority interrupts. Instead of keeping a
+ * variable to discount the interrupt interference from these variables, the
+ * starting time of these variables are pushed forward with the interrupt's
+ * duration. In this way, a single variable is used to:
+ *
+ * - Know if a given window is being measured.
+ * - Account its duration.
+ * - Discount the interference.
+ *
+ * To avoid getting inconsistent values, e.g.,:
+ *
+ * now = time_get()
+ * ---> interrupt!
+ * delta_start -= int duration;
+ * <---
+ * duration = now - delta_start;
+ *
+ * result: negative duration if the variable duration before the
+ * interrupt was smaller than the interrupt execution.
+ *
+ * A counter of interrupts is used. If the counter increased, try
+ * to capture an interference safe duration.
+ */
+static inline s64
+get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start)
+{
+ u64 int_counter, now;
+ s64 duration;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ now = time_get();
+ duration = (now - *delta_start);
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ /*
+ * This is an evidence of race conditions that cause
+ * a value to be "discounted" too much.
+ */
+ if (duration < 0)
+ osnoise_taint("Negative duration!\n");
+
+ *delta_start = 0;
+
+ return duration;
+}
+
+/*
+ *
+ * set_int_safe_time - Save the current time on *time, aware of interference
+ *
+ * Get the time, taking into consideration a possible interference from
+ * higher priority interrupts.
+ *
+ * See get_int_safe_duration() for an explanation.
+ */
+static u64
+set_int_safe_time(struct osnoise_variables *osn_var, u64 *time)
+{
+ u64 int_counter;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ *time = time_get();
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ return int_counter;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * copy_int_safe_time - Copy *src into *desc aware of interference
+ */
+static u64
+copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src)
+{
+ u64 int_counter;
+
+ do {
+ int_counter = local_read(&osn_var->int_counter);
+ /* synchronize with interrupts */
+ barrier();
+
+ *dst = *src;
+
+ /* synchronize with interrupts */
+ barrier();
+ } while (int_counter != local_read(&osn_var->int_counter));
+
+ return int_counter;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * trace_osnoise_callback - NMI entry/exit callback
+ *
+ * This function is called at the entry and exit NMI code. The bool enter
+ * distinguishes between either case. This function is used to note a NMI
+ * occurrence, compute the noise caused by the NMI, and to remove the noise
+ * it is potentially causing on other interference variables.
+ */
+void trace_osnoise_callback(bool enter)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ u64 duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ /*
+ * Currently trace_clock_local() calls sched_clock() and the
+ * generic version is not NMI safe.
+ */
+ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+ if (enter) {
+ osn_var->nmi.delta_start = time_get();
+ local_inc(&osn_var->int_counter);
+ } else {
+ duration = time_get() - osn_var->nmi.delta_start;
+
+ trace_nmi_noise(osn_var->nmi.delta_start, duration);
+
+ cond_move_irq_delta_start(osn_var, duration);
+ cond_move_softirq_delta_start(osn_var, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+ }
+ }
+
+ if (enter)
+ osn_var->nmi.count++;
+}
+
+/*
+ * osnoise_trace_irq_entry - Note the starting of an IRQ
+ *
+ * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs,
+ * it is safe to use a single variable (ons_var->irq) to save the statistics.
+ * The arrival_time is used to report... the arrival time. The delta_start
+ * is used to compute the duration at the IRQ exit handler. See
+ * cond_move_irq_delta_start().
+ */
+void osnoise_trace_irq_entry(int id)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+ if (!osn_var->sampling)
+ return;
+ /*
+ * This value will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->irq.arrival_time = time_get();
+ set_int_safe_time(osn_var, &osn_var->irq.delta_start);
+ osn_var->irq.count++;
+
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * osnoise_irq_exit - Note the end of an IRQ, sava data and trace
+ *
+ * Computes the duration of the IRQ noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+void osnoise_trace_irq_exit(int id, const char *desc)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ int duration;
+
+ if (!osn_var->sampling)
+ return;
+
+ duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start);
+ trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration);
+ osn_var->irq.arrival_time = 0;
+ cond_move_softirq_delta_start(osn_var, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+}
+
+/*
+ * trace_irqentry_callback - Callback to the irq:irq_entry traceevent
+ *
+ * Used to note the starting of an IRQ occurece.
+ */
+static void trace_irqentry_callback(void *data, int irq,
+ struct irqaction *action)
+{
+ osnoise_trace_irq_entry(irq);
+}
+
+/*
+ * trace_irqexit_callback - Callback to the irq:irq_exit traceevent
+ *
+ * Used to note the end of an IRQ occurece.
+ */
+static void trace_irqexit_callback(void *data, int irq,
+ struct irqaction *action, int ret)
+{
+ osnoise_trace_irq_exit(irq, action->name);
+}
+
+/*
+ * arch specific register function.
+ */
+int __weak osnoise_arch_register(void)
+{
+ return 0;
+}
+
+/*
+ * arch specific unregister function.
+ */
+void __weak osnoise_arch_unregister(void)
+{
+ return;
+}
+
+/*
+ * hook_irq_events - Hook IRQ handling events
+ *
+ * This function hooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+static int hook_irq_events(void)
+{
+ int ret;
+
+ ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+ if (ret)
+ goto out_unregister_entry;
+
+ ret = osnoise_arch_register();
+ if (ret)
+ goto out_irq_exit;
+
+ return 0;
+
+out_irq_exit:
+ unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+out_unregister_entry:
+ unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+out_err:
+ return -EINVAL;
+}
+
+/*
+ * unhook_irq_events - Unhook IRQ handling events
+ *
+ * This function unhooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+static void unhook_irq_events(void)
+{
+ osnoise_arch_unregister();
+ unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+ unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * trace_softirq_entry_callback - Note the starting of a softirq
+ *
+ * Save the starting time of a softirq. As softirqs are non-preemptive to
+ * other softirqs, it is safe to use a single variable (ons_var->softirq)
+ * to save the statistics. The arrival_time is used to report... the
+ * arrival time. The delta_start is used to compute the duration at the
+ * softirq exit handler. See cond_move_softirq_delta_start().
+ */
+static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+ if (!osn_var->sampling)
+ return;
+ /*
+ * This value will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->softirq.arrival_time = time_get();
+ set_int_safe_time(osn_var, &osn_var->softirq.delta_start);
+ osn_var->softirq.count++;
+
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * trace_softirq_exit_callback - Note the end of an softirq
+ *
+ * Computes the duration of the softirq noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ int duration;
+
+ if (!osn_var->sampling)
+ return;
+
+#ifdef CONFIG_TIMERLAT_TRACER
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (unlikely(osnoise_data.timerlat_tracer)) {
+ struct timerlat_variables *tlat_var;
+ tlat_var = this_cpu_tmr_var();
+ if (!tlat_var->tracing_thread) {
+ osn_var->softirq.arrival_time = 0;
+ osn_var->softirq.delta_start = 0;
+ return;
+ }
+ }
+#endif
+
+ duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
+ trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
+ cond_move_thread_delta_start(osn_var, duration);
+ osn_var->softirq.arrival_time = 0;
+}
+
+/*
+ * hook_softirq_events - Hook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static int hook_softirq_events(void)
+{
+ int ret;
+
+ ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+ if (ret)
+ goto out_unreg_entry;
+
+ return 0;
+
+out_unreg_entry:
+ unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+out_err:
+ return -EINVAL;
+}
+
+/*
+ * unhook_softirq_events - Unhook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static void unhook_softirq_events(void)
+{
+ unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+ unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+}
+#else /* CONFIG_PREEMPT_RT */
+/*
+ * softirq are threads on the PREEMPT_RT mode.
+ */
+static int hook_softirq_events(void)
+{
+ return 0;
+}
+static void unhook_softirq_events(void)
+{
+}
+#endif
+
+/*
+ * thread_entry - Record the starting of a thread noise window
+ *
+ * It saves the context switch time for a noisy thread, and increments
+ * the interference counters.
+ */
+static void
+thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+ if (!osn_var->sampling)
+ return;
+ /*
+ * The arrival time will be used in the report, but not to compute
+ * the execution time, so it is safe to get it unsafe.
+ */
+ osn_var->thread.arrival_time = time_get();
+
+ set_int_safe_time(osn_var, &osn_var->thread.delta_start);
+
+ osn_var->thread.count++;
+ local_inc(&osn_var->int_counter);
+}
+
+/*
+ * thread_exit - Report the end of a thread noise window
+ *
+ * It computes the total noise from a thread, tracing if needed.
+ */
+static void
+thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+ int duration;
+
+ if (!osn_var->sampling)
+ return;
+
+#ifdef CONFIG_TIMERLAT_TRACER
+ if (osnoise_data.timerlat_tracer) {
+ struct timerlat_variables *tlat_var;
+ tlat_var = this_cpu_tmr_var();
+ if (!tlat_var->tracing_thread) {
+ osn_var->thread.delta_start = 0;
+ osn_var->thread.arrival_time = 0;
+ return;
+ }
+ }
+#endif
+
+ duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
+
+ trace_thread_noise(t, osn_var->thread.arrival_time, duration);
+
+ osn_var->thread.arrival_time = 0;
+}
+
+/*
+ * trace_sched_switch - sched:sched_switch trace event handler
+ *
+ * This function is hooked to the sched:sched_switch trace event, and it is
+ * used to record the beginning and to report the end of a thread noise window.
+ */
+static void
+trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+ struct task_struct *n)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+ if (p->pid != osn_var->pid)
+ thread_exit(osn_var, p);
+
+ if (n->pid != osn_var->pid)
+ thread_entry(osn_var, n);
+}
+
+/*
+ * hook_thread_events - Hook the insturmentation for thread noise
+ *
+ * Hook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+static int hook_thread_events(void)
+{
+ int ret;
+
+ ret = register_trace_sched_switch(trace_sched_switch_callback, NULL);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * unhook_thread_events - *nhook the insturmentation for thread noise
+ *
+ * Unook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+static void unhook_thread_events(void)
+{
+ unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+}
+
+/*
+ * save_osn_sample_stats - Save the osnoise_sample statistics
+ *
+ * Save the osnoise_sample statistics before the sampling phase. These
+ * values will be used later to compute the diff betwneen the statistics
+ * before and after the osnoise sampling.
+ */
+static void
+save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+ s->nmi_count = osn_var->nmi.count;
+ s->irq_count = osn_var->irq.count;
+ s->softirq_count = osn_var->softirq.count;
+ s->thread_count = osn_var->thread.count;
+}
+
+/*
+ * diff_osn_sample_stats - Compute the osnoise_sample statistics
+ *
+ * After a sample period, compute the difference on the osnoise_sample
+ * statistics. The struct osnoise_sample *s contains the statistics saved via
+ * save_osn_sample_stats() before the osnoise sampling.
+ */
+static void
+diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+ s->nmi_count = osn_var->nmi.count - s->nmi_count;
+ s->irq_count = osn_var->irq.count - s->irq_count;
+ s->softirq_count = osn_var->softirq.count - s->softirq_count;
+ s->thread_count = osn_var->thread.count - s->thread_count;
+}
+
+/*
+ * osnoise_stop_tracing - Stop tracing and the tracer.
+ */
+static __always_inline void osnoise_stop_tracing(void)
+{
+ struct trace_array *tr = osnoise_trace;
+
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
+
+ tracer_tracing_off(tr);
+}
+
+/*
+ * run_osnoise - Sample the time and look for osnoise
+ *
+ * Used to capture the time, looking for potential osnoise latency repeatedly.
+ * Different from hwlat_detector, it is called with preemption and interrupts
+ * enabled. This allows irqs, softirqs and threads to run, interfering on the
+ * osnoise sampling thread, as they would do with a regular thread.
+ */
+static int run_osnoise(void)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ struct trace_array *tr = osnoise_trace;
+ u64 start, sample, last_sample;
+ u64 last_int_count, int_count;
+ s64 noise = 0, max_noise = 0;
+ s64 total, last_total = 0;
+ struct osnoise_sample s;
+ unsigned int threshold;
+ u64 runtime, stop_in;
+ u64 sum_noise = 0;
+ int hw_count = 0;
+ int ret = -1;
+
+ /*
+ * Considers the current thread as the workload.
+ */
+ osn_var->pid = current->pid;
+
+ /*
+ * Save the current stats for the diff
+ */
+ save_osn_sample_stats(osn_var, &s);
+
+ /*
+ * if threshold is 0, use the default value of 5 us.
+ */
+ threshold = tracing_thresh ? : 5000;
+
+ /*
+ * Make sure NMIs see sampling first
+ */
+ osn_var->sampling = true;
+ barrier();
+
+ /*
+ * Transform the *_us config to nanoseconds to avoid the
+ * division on the main loop.
+ */
+ runtime = osnoise_data.sample_runtime * NSEC_PER_USEC;
+ stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
+
+ /*
+ * Start timestemp
+ */
+ start = time_get();
+
+ /*
+ * "previous" loop.
+ */
+ last_int_count = set_int_safe_time(osn_var, &last_sample);
+
+ do {
+ /*
+ * Get sample!
+ */
+ int_count = set_int_safe_time(osn_var, &sample);
+
+ noise = time_sub(sample, last_sample);
+
+ /*
+ * This shouldn't happen.
+ */
+ if (noise < 0) {
+ osnoise_taint("negative noise!");
+ goto out;
+ }
+
+ /*
+ * Sample runtime.
+ */
+ total = time_sub(sample, start);
+
+ /*
+ * Check for possible overflows.
+ */
+ if (total < last_total) {
+ osnoise_taint("total overflow!");
+ break;
+ }
+
+ last_total = total;
+
+ if (noise >= threshold) {
+ int interference = int_count - last_int_count;
+
+ if (noise > max_noise)
+ max_noise = noise;
+
+ if (!interference)
+ hw_count++;
+
+ sum_noise += noise;
+
+ trace_sample_threshold(last_sample, noise, interference);
+
+ if (osnoise_data.stop_tracing)
+ if (noise > stop_in)
+ osnoise_stop_tracing();
+ }
+
+ /*
+ * For the non-preemptive kernel config: let threads runs, if
+ * they so wish.
+ */
+ cond_resched();
+
+ last_sample = sample;
+ last_int_count = int_count;
+
+ } while (total < runtime && !kthread_should_stop());
+
+ /*
+ * Finish the above in the view for interrupts.
+ */
+ barrier();
+
+ osn_var->sampling = false;
+
+ /*
+ * Make sure sampling data is no longer updated.
+ */
+ barrier();
+
+ /*
+ * Save noise info.
+ */
+ s.noise = time_to_us(sum_noise);
+ s.runtime = time_to_us(total);
+ s.max_sample = time_to_us(max_noise);
+ s.hw_count = hw_count;
+
+ /* Save interference stats info */
+ diff_osn_sample_stats(osn_var, &s);
+
+ trace_osnoise_sample(&s);
+
+ /* Keep a running maximum ever recorded osnoise "latency" */
+ if (max_noise > tr->max_latency) {
+ tr->max_latency = max_noise;
+ latency_fsnotify(tr);
+ }
+
+ if (osnoise_data.stop_tracing_total)
+ if (s.noise > osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+
+ return 0;
+out:
+ return ret;
+}
+
+static struct cpumask osnoise_cpumask;
+static struct cpumask save_cpumask;
+
+/*
+ * osnoise_main - The osnoise detection kernel thread
+ *
+ * Calls run_osnoise() function to measure the osnoise for the configured runtime,
+ * every period.
+ */
+static int osnoise_main(void *data)
+{
+ u64 interval;
+
+ while (!kthread_should_stop()) {
+
+ run_osnoise();
+
+ mutex_lock(&interface_lock);
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ mutex_unlock(&interface_lock);
+
+ do_div(interval, USEC_PER_MSEC);
+
+ /*
+ * differently from hwlat_detector, the osnoise tracer can run
+ * without a pause because preemption is on.
+ */
+ if (interval < 1) {
+ /* Let synchronize_rcu_tasks() make progress */
+ cond_resched_tasks_rcu_qs();
+ continue;
+ }
+
+ if (msleep_interruptible(interval))
+ break;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat_irq - hrtimer handler for timerlat.
+ */
+static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ struct trace_array *tr = osnoise_trace;
+ struct timerlat_variables *tlat;
+ struct timerlat_sample s;
+ u64 now;
+ u64 diff;
+
+ /*
+ * I am not sure if the timer was armed for this CPU. So, get
+ * the timerlat struct from the timer itself, not from this
+ * CPU.
+ */
+ tlat = container_of(timer, struct timerlat_variables, timer);
+
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+
+ /*
+ * Enable the osnoise: events for thread an softirq.
+ */
+ tlat->tracing_thread = true;
+
+ osn_var->thread.arrival_time = time_get();
+
+ /*
+ * A hardirq is running: the timer IRQ. It is for sure preempting
+ * a thread, and potentially preempting a softirq.
+ *
+ * At this point, it is not interesting to know the duration of the
+ * preempted thread (and maybe softirq), but how much time they will
+ * delay the beginning of the execution of the timer thread.
+ *
+ * To get the correct (net) delay added by the softirq, its delta_start
+ * is set as the IRQ one. In this way, at the return of the IRQ, the delta
+ * start of the sofitrq will be zeroed, accounting then only the time
+ * after that.
+ *
+ * The thread follows the same principle. However, if a softirq is
+ * running, the thread needs to receive the softirq delta_start. The
+ * reason being is that the softirq will be the last to be unfolded,
+ * resseting the thread delay to zero.
+ */
+#ifndef CONFIG_PREEMPT_RT
+ if (osn_var->softirq.delta_start) {
+ copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+ &osn_var->softirq.delta_start);
+
+ copy_int_safe_time(osn_var, &osn_var->softirq.delta_start,
+ &osn_var->irq.delta_start);
+ } else {
+ copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
+ &osn_var->irq.delta_start);
+ }
+#else /* CONFIG_PREEMPT_RT */
+ /*
+ * The sofirqs run as threads on RT, so there is not need
+ * to keep track of it.
+ */
+ copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start);
+#endif /* CONFIG_PREEMPT_RT */
+
+ /*
+ * Compute the current time with the expected time.
+ */
+ diff = now - tlat->abs_period;
+
+ tlat->count++;
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = IRQ_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+ /* Keep a running maximum ever recorded os noise "latency" */
+ if (diff > tr->max_latency) {
+ tr->max_latency = diff;
+ latency_fsnotify(tr);
+ }
+
+ if (osnoise_data.stop_tracing)
+ if (time_to_us(diff) >= osnoise_data.stop_tracing)
+ osnoise_stop_tracing();
+
+ wake_up_process(tlat->kthread);
+
+ if (osnoise_data.print_stack)
+ timerlat_save_stack(0);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * wait_next_period - Wait for the next period for timerlat
+ */
+static int wait_next_period(struct timerlat_variables *tlat)
+{
+ ktime_t next_abs_period, now;
+ u64 rel_period = osnoise_data.timerlat_period * 1000;
+
+ now = hrtimer_cb_get_time(&tlat->timer);
+ next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+
+ /*
+ * Save the next abs_period.
+ */
+ tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+
+ /*
+ * If the new abs_period is in the past, skip the activation.
+ */
+ while (ktime_compare(now, next_abs_period) > 0) {
+ next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
+ tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD);
+ schedule();
+ return 1;
+}
+
+/*
+ * timerlat_main- Timerlat main
+ */
+static int timerlat_main(void *data)
+{
+ struct osnoise_variables *osn_var = this_cpu_osn_var();
+ struct timerlat_variables *tlat = this_cpu_tmr_var();
+ struct timerlat_sample s;
+ struct sched_param sp;
+ u64 now, diff;
+
+ /*
+ * Make the thread RT, that is how cyclictest is usually used.
+ */
+ sp.sched_priority = DEFAULT_TIMERLAT_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+ tlat->count = 0;
+ tlat->tracing_thread = false;
+
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+ tlat->kthread = current;
+ osn_var->pid = current->pid;
+ /*
+ * Anotate the arrival time.
+ */
+ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
+
+ wait_next_period(tlat);
+
+ osn_var->sampling = 1;
+
+ while (!kthread_should_stop()) {
+ now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
+ diff = now - tlat->abs_period;
+
+ s.seqnum = tlat->count;
+ s.timer_latency = diff;
+ s.context = THREAD_CONTEXT;
+
+ trace_timerlat_sample(&s);
+
+#ifdef CONFIG_STACKTRACE
+ if (osnoise_data.print_stack)
+ if (osnoise_data.print_stack <= time_to_us(diff))
+ timerlat_dump_stack();
+#endif /* CONFIG_STACKTRACE */
+
+ tlat->tracing_thread = false;
+ if (osnoise_data.stop_tracing_total)
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ osnoise_stop_tracing();
+
+ wait_next_period(tlat);
+ }
+
+ hrtimer_cancel(&tlat->timer);
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+/*
+ * stop_kthread - stop a workload thread
+ */
+static void stop_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+
+ kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+ if (kthread)
+ kthread_stop(kthread);
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+}
+
+/*
+ * stop_per_cpu_kthread - Stop per-cpu threads
+ *
+ * Stop the osnoise sampling htread. Use this on unload and at system
+ * shutdown.
+ */
+static void stop_per_cpu_kthreads(void)
+{
+ int cpu;
+
+ cpus_read_lock();
+
+ for_each_online_cpu(cpu)
+ stop_kthread(cpu);
+
+ cpus_read_unlock();
+}
+
+/*
+ * start_kthread - Start a workload tread
+ */
+static int start_kthread(unsigned int cpu)
+{
+ struct task_struct *kthread;
+ void *main = osnoise_main;
+ char comm[24];
+
+#ifdef CONFIG_TIMERLAT_TRACER
+ if (osnoise_data.timerlat_tracer) {
+ snprintf(comm, 24, "timerlat/%d", cpu);
+ main = timerlat_main;
+ } else {
+ snprintf(comm, 24, "osnoise/%d", cpu);
+ }
+#else
+ snprintf(comm, 24, "osnoise/%d", cpu);
+#endif
+ kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
+
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ stop_per_cpu_kthreads();
+ return -ENOMEM;
+ }
+
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+ wake_up_process(kthread);
+
+ return 0;
+}
+
+/*
+ * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads
+ *
+ * This starts the kernel thread that will look for osnoise on many
+ * cpus.
+ */
+static int start_per_cpu_kthreads(struct trace_array *tr)
+{
+ struct cpumask *current_mask = &save_cpumask;
+ int retval = 0;
+ int cpu;
+
+ cpus_read_lock();
+ /*
+ * Run only on CPUs in which trace and osnoise are allowed to run.
+ */
+ cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask);
+ /*
+ * And the CPU is online.
+ */
+ cpumask_and(current_mask, cpu_online_mask, current_mask);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+
+ for_each_cpu(cpu, current_mask) {
+ retval = start_kthread(cpu);
+ if (retval) {
+ stop_per_cpu_kthreads();
+ break;
+ }
+ }
+
+ cpus_read_unlock();
+
+ return retval;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void osnoise_hotplug_workfn(struct work_struct *dummy)
+{
+ struct trace_array *tr = osnoise_trace;
+ unsigned int cpu = smp_processor_id();
+
+
+ mutex_lock(&trace_types_lock);
+
+ if (!osnoise_busy)
+ goto out_unlock_trace;
+
+ mutex_lock(&interface_lock);
+ cpus_read_lock();
+
+ if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
+ goto out_unlock;
+
+ if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
+ goto out_unlock;
+
+ start_kthread(cpu);
+
+out_unlock:
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+out_unlock_trace:
+ mutex_unlock(&trace_types_lock);
+}
+
+static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
+
+/*
+ * osnoise_cpu_init - CPU hotplug online callback function
+ */
+static int osnoise_cpu_init(unsigned int cpu)
+{
+ schedule_work_on(cpu, &osnoise_hotplug_work);
+ return 0;
+}
+
+/*
+ * osnoise_cpu_die - CPU hotplug offline callback function
+ */
+static int osnoise_cpu_die(unsigned int cpu)
+{
+ stop_kthread(cpu);
+ return 0;
+}
+
+static void osnoise_init_hotplug_support(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online",
+ osnoise_cpu_init, osnoise_cpu_die);
+ if (ret < 0)
+ pr_warn(BANNER "Error to init cpu hotplug support\n");
+
+ return;
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void osnoise_init_hotplug_support(void)
+{
+ return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * osnoise_cpus_read - Read function for reading the "cpus" file
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * Prints the "cpus" output into the user-provided buffer.
+ */
+static ssize_t
+osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ char *mask_str;
+ int len;
+
+ mutex_lock(&interface_lock);
+
+ len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str) {
+ count = -ENOMEM;
+ goto out_unlock;
+ }
+
+ len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
+ if (len >= count) {
+ count = -EINVAL;
+ goto out_free;
+ }
+
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
+
+out_free:
+ kfree(mask_str);
+out_unlock:
+ mutex_unlock(&interface_lock);
+
+ return count;
+}
+
+static void osnoise_tracer_start(struct trace_array *tr);
+static void osnoise_tracer_stop(struct trace_array *tr);
+
+/*
+ * osnoise_cpus_write - Write function for "cpus" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "cpus"
+ * interface to the osnoise trace. By default, it lists all CPUs,
+ * in this way, allowing osnoise threads to run on any online CPU
+ * of the system. It serves to restrict the execution of osnoise to the
+ * set of CPUs writing via this interface. Note that osnoise also
+ * respects the "tracing_cpumask." Hence, osnoise threads will run only
+ * on the set of CPUs allowed here AND on "tracing_cpumask." Why not
+ * have just "tracing_cpumask?" Because the user might be interested
+ * in tracing what is running on other CPUs. For instance, one might
+ * run osnoise in one HT CPU while observing what is running on the
+ * sibling HT CPU.
+ */
+static ssize_t
+osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
+ loff_t *ppos)
+{
+ struct trace_array *tr = osnoise_trace;
+ cpumask_var_t osnoise_cpumask_new;
+ int running, err;
+ char buf[256];
+
+ if (count >= 256)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, count))
+ return -EFAULT;
+
+ if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpulist_parse(buf, osnoise_cpumask_new);
+ if (err)
+ goto err_free;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop
+ * and osnoise_busy.
+ */
+ mutex_lock(&trace_types_lock);
+ running = osnoise_busy;
+ if (running)
+ osnoise_tracer_stop(tr);
+
+ mutex_lock(&interface_lock);
+ /*
+ * osnoise_cpumask is read by CPU hotplug operations.
+ */
+ cpus_read_lock();
+
+ cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
+
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+
+ if (running)
+ osnoise_tracer_start(tr);
+ mutex_unlock(&trace_types_lock);
+
+ free_cpumask_var(osnoise_cpumask_new);
+ return count;
+
+err_free:
+ free_cpumask_var(osnoise_cpumask_new);
+
+ return err;
+}
+
+/*
+ * osnoise/runtime_us: cannot be greater than the period.
+ */
+static struct trace_min_max_param osnoise_runtime = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.sample_runtime,
+ .max = &osnoise_data.sample_period,
+ .min = NULL,
+};
+
+/*
+ * osnoise/period_us: cannot be smaller than the runtime.
+ */
+static struct trace_min_max_param osnoise_period = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.sample_period,
+ .max = NULL,
+ .min = &osnoise_data.sample_runtime,
+};
+
+/*
+ * osnoise/stop_tracing_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_in = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.stop_tracing,
+ .max = NULL,
+ .min = NULL,
+};
+
+/*
+ * osnoise/stop_tracing_total_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_total = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.stop_tracing_total,
+ .max = NULL,
+ .min = NULL,
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * osnoise/print_stack: print the stacktrace of the IRQ handler if the total
+ * latency is higher than val.
+ */
+static struct trace_min_max_param osnoise_print_stack = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.print_stack,
+ .max = NULL,
+ .min = NULL,
+};
+
+/*
+ * osnoise/timerlat_period: min 100 us, max 1 s
+ */
+u64 timerlat_min_period = 100;
+u64 timerlat_max_period = 1000000;
+static struct trace_min_max_param timerlat_period = {
+ .lock = &interface_lock,
+ .val = &osnoise_data.timerlat_period,
+ .max = &timerlat_max_period,
+ .min = &timerlat_min_period,
+};
+#endif
+
+static const struct file_operations cpus_fops = {
+ .open = tracing_open_generic,
+ .read = osnoise_cpus_read,
+ .write = osnoise_cpus_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "osnoise" and "timerlat".
+ * It creates these directories in the tracing directory, and within that
+ * directory the use can change and view the configs.
+ */
+static int init_tracefs(void)
+{
+ struct dentry *top_dir;
+ struct dentry *tmp;
+ int ret;
+
+ ret = tracing_init_dentry();
+ if (ret)
+ return -ENOMEM;
+
+ top_dir = tracefs_create_dir("osnoise", NULL);
+ if (!top_dir)
+ return 0;
+
+ tmp = tracefs_create_file("period_us", 0640, top_dir,
+ &osnoise_period, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("runtime_us", 0644, top_dir,
+ &osnoise_runtime, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir,
+ &osnoise_stop_tracing_in, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir,
+ &osnoise_stop_tracing_total, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+
+ tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops);
+ if (!tmp)
+ goto err;
+#ifdef CONFIG_TIMERLAT_TRACER
+#ifdef CONFIG_STACKTRACE
+ tmp = tracefs_create_file("print_stack", 0640, top_dir,
+ &osnoise_print_stack, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+#endif
+
+ tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir,
+ &timerlat_period, &trace_min_max_fops);
+ if (!tmp)
+ goto err;
+#endif
+
+ return 0;
+
+err:
+ tracefs_remove(top_dir);
+ return -ENOMEM;
+}
+
+static int osnoise_hook_events(void)
+{
+ int retval;
+
+ /*
+ * Trace is already hooked, we are re-enabling from
+ * a stop_tracing_*.
+ */
+ if (trace_osnoise_callback_enabled)
+ return 0;
+
+ retval = hook_irq_events();
+ if (retval)
+ return -EINVAL;
+
+ retval = hook_softirq_events();
+ if (retval)
+ goto out_unhook_irq;
+
+ retval = hook_thread_events();
+ /*
+ * All fine!
+ */
+ if (!retval)
+ return 0;
+
+ unhook_softirq_events();
+out_unhook_irq:
+ unhook_irq_events();
+ return -EINVAL;
+}
+
+static int __osnoise_tracer_start(struct trace_array *tr)
+{
+ int retval;
+
+ osn_var_reset_all();
+
+ retval = osnoise_hook_events();
+ if (retval)
+ return retval;
+ /*
+ * Make sure NMIs see reseted values.
+ */
+ barrier();
+ trace_osnoise_callback_enabled = true;
+
+ retval = start_per_cpu_kthreads(tr);
+ if (retval) {
+ unhook_irq_events();
+ return retval;
+ }
+
+ osnoise_busy = true;
+
+ return 0;
+}
+
+static void osnoise_tracer_start(struct trace_array *tr)
+{
+ int retval;
+
+ if (osnoise_busy)
+ return;
+
+ retval = __osnoise_tracer_start(tr);
+ if (retval)
+ pr_err(BANNER "Error starting osnoise tracer\n");
+
+}
+
+static void osnoise_tracer_stop(struct trace_array *tr)
+{
+ if (!osnoise_busy)
+ return;
+
+ trace_osnoise_callback_enabled = false;
+ barrier();
+
+ stop_per_cpu_kthreads();
+
+ unhook_irq_events();
+ unhook_softirq_events();
+ unhook_thread_events();
+
+ osnoise_busy = false;
+}
+
+static int osnoise_tracer_init(struct trace_array *tr)
+{
+
+ /* Only allow one instance to enable this */
+ if (osnoise_busy)
+ return -EBUSY;
+
+ osnoise_trace = tr;
+ tr->max_latency = 0;
+
+ osnoise_tracer_start(tr);
+
+ return 0;
+}
+
+static void osnoise_tracer_reset(struct trace_array *tr)
+{
+ osnoise_tracer_stop(tr);
+}
+
+static struct tracer osnoise_tracer __read_mostly = {
+ .name = "osnoise",
+ .init = osnoise_tracer_init,
+ .reset = osnoise_tracer_reset,
+ .start = osnoise_tracer_start,
+ .stop = osnoise_tracer_stop,
+ .print_header = print_osnoise_headers,
+ .allow_instances = true,
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+static void timerlat_tracer_start(struct trace_array *tr)
+{
+ int retval;
+
+ if (osnoise_busy)
+ return;
+
+ osnoise_data.timerlat_tracer = 1;
+
+ retval = __osnoise_tracer_start(tr);
+ if (retval)
+ goto out_err;
+
+ return;
+out_err:
+ pr_err(BANNER "Error starting timerlat tracer\n");
+}
+
+static void timerlat_tracer_stop(struct trace_array *tr)
+{
+ int cpu;
+
+ if (!osnoise_busy)
+ return;
+
+ for_each_online_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+
+ osnoise_tracer_stop(tr);
+
+ osnoise_data.timerlat_tracer = 0;
+}
+
+static int timerlat_tracer_init(struct trace_array *tr)
+{
+ /* Only allow one instance to enable this */
+ if (osnoise_busy)
+ return -EBUSY;
+
+ osnoise_trace = tr;
+
+ tr->max_latency = 0;
+
+ timerlat_tracer_start(tr);
+
+ return 0;
+}
+
+static void timerlat_tracer_reset(struct trace_array *tr)
+{
+ timerlat_tracer_stop(tr);
+}
+
+static struct tracer timerlat_tracer __read_mostly = {
+ .name = "timerlat",
+ .init = timerlat_tracer_init,
+ .reset = timerlat_tracer_reset,
+ .start = timerlat_tracer_start,
+ .stop = timerlat_tracer_stop,
+ .print_header = print_timerlat_headers,
+ .allow_instances = true,
+};
+#endif /* CONFIG_TIMERLAT_TRACER */
+
+__init static int init_osnoise_tracer(void)
+{
+ int ret;
+
+ mutex_init(&interface_lock);
+
+ cpumask_copy(&osnoise_cpumask, cpu_all_mask);
+
+ ret = register_tracer(&osnoise_tracer);
+ if (ret) {
+ pr_err(BANNER "Error registering osnoise!\n");
+ return ret;
+ }
+
+#ifdef CONFIG_TIMERLAT_TRACER
+ ret = register_tracer(&timerlat_tracer);
+ if (ret) {
+ pr_err(BANNER "Error registering timerlat\n");
+ return ret;
+ }
+#endif
+ osnoise_init_hotplug_support();
+
+ init_tracefs();
+
+ return 0;
+}
+late_initcall(init_osnoise_tracer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d0368a569bfa..c2ca40e8595b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -492,8 +492,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
trace_seq_printf(s, "%c%c%c",
irqs_off, need_resched, hardsoft_irq);
- if (entry->preempt_count)
- trace_seq_printf(s, "%x", entry->preempt_count);
+ if (entry->preempt_count & 0xf)
+ trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
+ else
+ trace_seq_putc(s, '.');
+
+ if (entry->preempt_count & 0xf0)
+ trace_seq_printf(s, "%x", entry->preempt_count >> 4);
else
trace_seq_putc(s, '.');
@@ -656,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_seq_printf(
s, "%16s %7d %3d %d %08x %08lx ",
comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ entry->preempt_count & 0xf, iter->idx);
} else {
lat_print_generic(s, entry, iter->cpu);
}
@@ -1202,7 +1207,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-
static enum print_line_t
trace_hwlat_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -1232,6 +1236,122 @@ static struct trace_event trace_hwlat_event = {
.funcs = &trace_hwlat_funcs,
};
+/* TRACE_OSNOISE */
+static enum print_line_t
+trace_osnoise_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct osnoise_entry *field;
+ u64 ratio, ratio_dec;
+ u64 net_runtime;
+
+ trace_assign_type(field, entry);
+
+ /*
+ * compute the available % of cpu time.
+ */
+ net_runtime = field->runtime - field->noise;
+ ratio = net_runtime * 10000000;
+ do_div(ratio, field->runtime);
+ ratio_dec = do_div(ratio, 100000);
+
+ trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu",
+ field->runtime,
+ field->noise,
+ ratio, ratio_dec,
+ field->max_sample);
+
+ trace_seq_printf(s, " %6u", field->hw_count);
+ trace_seq_printf(s, " %6u", field->nmi_count);
+ trace_seq_printf(s, " %6u", field->irq_count);
+ trace_seq_printf(s, " %6u", field->softirq_count);
+ trace_seq_printf(s, " %6u", field->thread_count);
+
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_osnoise_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct osnoise_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n",
+ field->runtime,
+ field->noise,
+ field->max_sample,
+ field->hw_count,
+ field->nmi_count,
+ field->irq_count,
+ field->softirq_count,
+ field->thread_count);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_osnoise_funcs = {
+ .trace = trace_osnoise_print,
+ .raw = trace_osnoise_raw,
+};
+
+static struct trace_event trace_osnoise_event = {
+ .type = TRACE_OSNOISE,
+ .funcs = &trace_osnoise_funcs,
+};
+
+/* TRACE_TIMERLAT */
+static enum print_line_t
+trace_timerlat_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct timerlat_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n",
+ field->seqnum,
+ field->context ? "thread" : "irq",
+ field->timer_latency);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_timerlat_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct timerlat_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%u %d %llu\n",
+ field->seqnum,
+ field->context,
+ field->timer_latency);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_timerlat_funcs = {
+ .trace = trace_timerlat_print,
+ .raw = trace_timerlat_raw,
+};
+
+static struct trace_event trace_timerlat_event = {
+ .type = TRACE_TIMERLAT,
+ .funcs = &trace_timerlat_funcs,
+};
+
/* TRACE_BPUTS */
static enum print_line_t
trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1442,6 +1562,8 @@ static struct trace_event *events[] __initdata = {
&trace_bprint_event,
&trace_print_event,
&trace_hwlat_event,
+ &trace_osnoise_event,
+ &trace_timerlat_event,
&trace_raw_data_event,
&trace_func_repeats_event,
NULL
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 15413ad7cef2..3ed2a3f37297 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
int len;
slash = strchr(event, '/');
+ if (!slash)
+ slash = strchr(event, '.');
+
if (slash) {
if (slash == event) {
trace_probe_log_err(offset, NO_GROUP_NAME);
@@ -316,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
#endif
+ } else if (flags & TPARG_FL_TPOINT) {
+ if (code->data)
+ return -EFAULT;
+ code->data = kstrdup(arg, GFP_KERNEL);
+ if (!code->data)
+ return -ENOMEM;
+ code->op = FETCH_OP_TP_ARG;
} else
goto inval_var;
@@ -540,26 +550,34 @@ static int __parse_bitfield_probe_arg(const char *bf,
}
/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
struct probe_arg *parg, unsigned int flags, int offset)
{
struct fetch_insn *code, *scode, *tmp = NULL;
char *t, *t2, *t3;
+ char *arg;
int ret, len;
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ ret = -EINVAL;
len = strlen(arg);
if (len > MAX_ARGSTR_LEN) {
trace_probe_log_err(offset, ARG_TOO_LONG);
- return -EINVAL;
+ goto out;
} else if (len == 0) {
trace_probe_log_err(offset, NO_ARG_BODY);
- return -EINVAL;
+ goto out;
}
+ ret = -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
if (!parg->comm)
- return -ENOMEM;
+ goto out;
+ ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
*t = '\0';
@@ -571,22 +589,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
offset += t2 + strlen(t2) - arg;
trace_probe_log_err(offset,
ARRAY_NO_CLOSE);
- return -EINVAL;
+ goto out;
} else if (t3[1] != '\0') {
trace_probe_log_err(offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- return -EINVAL;
+ goto out;
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
trace_probe_log_err(offset + t2 - arg,
BAD_ARRAY_NUM);
- return -EINVAL;
+ goto out;
}
if (parg->count > MAX_ARRAY_LEN) {
trace_probe_log_err(offset + t2 - arg,
ARRAY_TOO_BIG);
- return -EINVAL;
+ goto out;
}
}
}
@@ -598,29 +616,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
- return -EINVAL;
+ goto out;
parg->type = find_fetch_type("string");
} else
parg->type = find_fetch_type(t);
if (!parg->type) {
trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
- return -EINVAL;
+ goto out;
}
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ ret = -ENOMEM;
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
if (!parg->fmt)
- return -ENOMEM;
+ goto out;
snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
parg->count);
}
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
- return -ENOMEM;
+ goto out;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
@@ -628,19 +647,20 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (ret)
goto fail;
+ ret = -EINVAL;
/* Store operation */
if (!strcmp(parg->type->name, "string") ||
!strcmp(parg->type->name, "ustring")) {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
- code->op != FETCH_OP_DATA) {
+ code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
- ret = -EINVAL;
goto fail;
}
if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
- code->op == FETCH_OP_DATA) || parg->count) {
+ code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
+ parg->count) {
/*
* IMM, DATA and COMM is pointing actual address, those
* must be kept, and if parg->count != 0, this is an
@@ -650,7 +670,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
}
@@ -672,7 +691,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_ST_RAW;
@@ -687,6 +705,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
goto fail;
}
}
+ ret = -EINVAL;
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
@@ -694,13 +713,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
scode->op != FETCH_OP_ST_USTRING) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
- ret = -EINVAL;
goto fail;
}
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_LP_ARRAY;
@@ -709,6 +726,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
code->op = FETCH_OP_END;
+ ret = 0;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -724,6 +742,8 @@ fail:
kfree(code->data);
}
kfree(tmp);
+out:
+ kfree(arg);
return ret;
}
@@ -745,11 +765,11 @@ static int traceprobe_conflict_field_name(const char *name,
return 0;
}
-int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
+int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
unsigned int flags)
{
struct probe_arg *parg = &tp->args[i];
- char *body;
+ const char *body;
/* Increment count for freeing args in error case */
tp->nr_args++;
@@ -839,19 +859,29 @@ int traceprobe_update_arg(struct probe_arg *arg)
/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
- bool is_return)
+ enum probe_print_type ptype)
{
struct probe_arg *parg;
int i, j;
int pos = 0;
const char *fmt, *arg;
- if (!is_return) {
+ switch (ptype) {
+ case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
arg = "REC->" FIELD_STRING_IP;
- } else {
+ break;
+ case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ break;
+ case PROBE_PRINT_EVENT:
+ fmt = "(%u)";
+ arg = "REC->" FIELD_STRING_TYPE;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
@@ -900,20 +930,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
}
#undef LEN_OR_ZERO
-int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
+int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int len;
char *print_fmt;
/* First: called with 0 length to calculate the needed length */
- len = __set_print_fmt(tp, NULL, 0, is_return);
+ len = __set_print_fmt(tp, NULL, 0, ptype);
print_fmt = kmalloc(len + 1, GFP_KERNEL);
if (!print_fmt)
return -ENOMEM;
/* Second: actually write the @print_fmt */
- __set_print_fmt(tp, print_fmt, len + 1, is_return);
+ __set_print_fmt(tp, print_fmt, len + 1, ptype);
call->print_fmt = print_fmt;
return 0;
@@ -1029,11 +1059,36 @@ error:
return ret;
}
+static struct trace_event_call *
+find_trace_event_call(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ return tp_event;
+ }
+
+ return NULL;
+}
+
int trace_probe_register_event_call(struct trace_probe *tp)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int ret;
+ lockdep_assert_held(&event_mutex);
+
+ if (find_trace_event_call(trace_probe_group_name(tp),
+ trace_probe_name(tp)))
+ return -EEXIST;
+
ret = register_trace_event(&call->event);
if (!ret)
return -ENODEV;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 227d518e5ba5..99e7a5df025e 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,6 +38,7 @@
#define FIELD_STRING_IP "__probe_ip"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
+#define FIELD_STRING_TYPE "__probe_type"
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
@@ -102,6 +103,7 @@ enum fetch_op {
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
FETCH_OP_LP_ARRAY, /* Array: .param = loop count */
+ FETCH_OP_TP_ARG, /* Trace Point argument */
FETCH_OP_END,
FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */
};
@@ -351,10 +353,11 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
-#define TPARG_FL_MASK GENMASK(2, 0)
+#define TPARG_FL_TPOINT BIT(3)
+#define TPARG_FL_MASK GENMASK(3, 0)
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
- char *arg, unsigned int flags);
+ const char *argv, unsigned int flags);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
@@ -363,7 +366,13 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset);
-extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
+enum probe_print_type {
+ PROBE_PRINT_NORMAL,
+ PROBE_PRINT_RETURN,
+ PROBE_PRINT_EVENT,
+};
+
+extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype);
#ifdef CONFIG_PERF_EVENTS
extern struct trace_event_call *
@@ -399,6 +408,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_EVENT_NAME, "Event name is not specified"), \
C(EVENT_TOO_LONG, "Event name is too long"), \
C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
+ C(EVENT_EXIST, "Given group/event name is already used by another event"), \
C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
C(BAD_STACK_NUM, "Invalid stack number"), \
C(BAD_ARG_NUM, "Invalid argument number"), \
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f003c5d02a3a..b3bdb8ddb862 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs,
+process_fetch_insn(struct fetch_insn *code, void *rec,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
+store_trace_args(void *data, struct trace_probe *tp, void *rec,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, regs, dl, base);
+ ret = process_fetch_insn(arg->code, rec, dl, base);
if (unlikely(ret < 0 && arg->dynamic)) {
*dl = make_data_loc(0, dyndata - base);
} else {
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e5778d1d7a5b..2402de520eca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,9 +26,9 @@ static struct task_struct *wakeup_task;
static int wakeup_cpu;
static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
-static int wakeup_rt;
-static int wakeup_dl;
-static int tracing_dl = 0;
+static bool wakeup_rt;
+static bool wakeup_dl;
+static bool tracing_dl;
static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -498,7 +498,7 @@ static void __wakeup_reset(struct trace_array *tr)
{
wakeup_cpu = -1;
wakeup_prio = -1;
- tracing_dl = 0;
+ tracing_dl = false;
if (wakeup_task)
put_task_struct(wakeup_task);
@@ -572,9 +572,9 @@ probe_wakeup(void *ignore, struct task_struct *p)
* another task until the first one wakes up.
*/
if (dl_task(p))
- tracing_dl = 1;
+ tracing_dl = true;
else
- tracing_dl = 0;
+ tracing_dl = false;
wakeup_task = get_task_struct(p);
@@ -685,8 +685,8 @@ static int wakeup_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 0;
- wakeup_rt = 0;
+ wakeup_dl = false;
+ wakeup_rt = false;
return __wakeup_tracer_init(tr);
}
@@ -695,8 +695,8 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 0;
- wakeup_rt = 1;
+ wakeup_dl = false;
+ wakeup_rt = true;
return __wakeup_tracer_init(tr);
}
@@ -705,8 +705,8 @@ static int wakeup_dl_tracer_init(struct trace_array *tr)
if (wakeup_busy)
return -EBUSY;
- wakeup_dl = 1;
- wakeup_rt = 0;
+ wakeup_dl = true;
+ wakeup_rt = false;
return __wakeup_tracer_init(tr);
}
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index 6e146b959dcd..b29595fe3ac5 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -5,7 +5,7 @@
#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
+#define SYNTH_FIELDS_MAX 64
#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */
@@ -14,10 +14,10 @@ struct synth_field {
char *name;
size_t size;
unsigned int offset;
+ unsigned int field_pos;
bool is_signed;
bool is_string;
bool is_dynamic;
- bool field_pos;
};
struct synth_event {
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9b50869a5ddb..225ce569bf8f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -83,10 +83,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
-#define SIZEOF_TRACE_UPROBE(n) \
- (offsetof(struct trace_uprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
@@ -217,9 +213,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
/* 1st stage: get value from context */
@@ -340,7 +337,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
struct trace_uprobe *tu;
int ret;
- tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL);
if (!tu)
return ERR_PTR(-ENOMEM);
@@ -393,6 +390,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
if (trace_probe_has_sibling(&tu->tp))
goto unreg;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp)))
+ return -EBUSY;
+
ret = unregister_uprobe_event(tu);
if (ret)
return ret;
@@ -455,7 +456,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
/* Append to existing event */
ret = trace_probe_append(&tu->tp, &to->tp);
if (!ret)
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
return ret;
}
@@ -514,11 +515,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
ret = register_uprobe_event(tu);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
end:
mutex_unlock(&event_mutex);
@@ -536,6 +541,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
char buf[MAX_EVENT_NAME_LEN];
+ enum probe_print_type ptype;
struct path path;
unsigned long offset, ref_ctr_offset;
bool is_return = false;
@@ -680,21 +686,15 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
is_return ? TPARG_FL_RETURN : 0);
- kfree(tmp);
if (ret)
goto error;
}
- ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu));
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tu->tp, ptype);
if (ret < 0)
goto error;
@@ -1585,6 +1585,7 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs,
unsigned long ref_ctr_offset, bool is_return)
{
+ enum probe_print_type ptype;
struct trace_uprobe *tu;
struct path path;
int ret;
@@ -1619,7 +1620,8 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu);
- if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9f478d29b926..64ea283f2f86 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -15,12 +15,57 @@
#include <linux/sched/task.h>
#include <linux/static_key.h>
+enum tp_func_state {
+ TP_FUNC_0,
+ TP_FUNC_1,
+ TP_FUNC_2,
+ TP_FUNC_N,
+};
+
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
+enum tp_transition_sync {
+ TP_TRANSITION_SYNC_1_0_1,
+ TP_TRANSITION_SYNC_N_2_1,
+
+ _NR_TP_TRANSITION_SYNC,
+};
+
+struct tp_transition_snapshot {
+ unsigned long rcu;
+ unsigned long srcu;
+ bool ongoing;
+};
+
+/* Protected by tracepoints_mutex */
+static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
+
+static void tp_rcu_get_state(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ /* Keep the latest get_state snapshot. */
+ snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = true;
+}
+
+static void tp_rcu_cond_sync(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ if (!snapshot->ongoing)
+ return;
+ cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
+ synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = false;
+}
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -246,26 +291,29 @@ static void *func_remove(struct tracepoint_func **funcs,
return old;
}
-static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync)
+/*
+ * Count the number of functions (enum tp_func_state) in a tp_funcs array.
+ */
+static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs)
+{
+ if (!tp_funcs)
+ return TP_FUNC_0;
+ if (!tp_funcs[1].func)
+ return TP_FUNC_1;
+ if (!tp_funcs[2].func)
+ return TP_FUNC_2;
+ return TP_FUNC_N; /* 3 or more */
+}
+
+static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs)
{
void *func = tp->iterator;
/* Synthetic events do not have static call sites */
if (!tp->static_call_key)
return;
-
- if (!tp_funcs[1].func) {
+ if (nr_func_state(tp_funcs) == TP_FUNC_1)
func = tp_funcs[0].func;
- /*
- * If going from the iterator back to a single caller,
- * we need to synchronize with __DO_TRACE to make sure
- * that the data passed to the callback is the one that
- * belongs to that callback.
- */
- if (sync)
- tracepoint_synchronize_unregister();
- }
-
__static_call_update(tp->static_call_key, tp->static_call_tramp, func);
}
@@ -273,7 +321,8 @@ static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func, int prio)
+ struct tracepoint_func *func, int prio,
+ bool warn)
{
struct tracepoint_func *old, *tp_funcs;
int ret;
@@ -288,7 +337,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -298,9 +347,41 @@ static int tracepoint_add_func(struct tracepoint *tp,
* a pointer to it. This array is referenced by __DO_TRACE from
* include/linux/tracepoint.h using rcu_dereference_sched().
*/
- rcu_assign_pointer(tp->funcs, tp_funcs);
- tracepoint_update_call(tp, tp_funcs, false);
- static_key_enable(&tp->key);
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_1: /* 0->1 */
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ static_key_enable(&tp->key);
+ break;
+ case TP_FUNC_2: /* 1->2 */
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /*
+ * Iterator callback installed before updating tp->funcs.
+ * Requires ordering between RCU assign/dereference and
+ * static call update/call.
+ */
+ fallthrough;
+ case TP_FUNC_N: /* N->N+1 (N>1) */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>1) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
release_probes(old);
return 0;
@@ -327,23 +408,84 @@ static int tracepoint_remove_func(struct tracepoint *tp,
/* Failed allocating new tp_funcs, replaced func with stub */
return 0;
- if (!tp_funcs) {
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_0: /* 1->0 */
/* Removed last function */
if (tp->unregfunc && static_key_enabled(&tp->key))
tp->unregfunc();
static_key_disable(&tp->key);
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, NULL);
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1);
+ break;
+ case TP_FUNC_1: /* 2->1 */
rcu_assign_pointer(tp->funcs, tp_funcs);
- } else {
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence. If the first
+ * element's data has changed, then force the synchronization
+ * to prevent current readers that have loaded the old data
+ * from calling the new function.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ break;
+ case TP_FUNC_2: /* N->N-1 (N>2) */
+ fallthrough;
+ case TP_FUNC_N:
rcu_assign_pointer(tp->funcs, tp_funcs);
- tracepoint_update_call(tp, tp_funcs,
- tp_funcs[0].func != old[0].func);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
release_probes(old);
return 0;
}
/**
+ * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ * @prio: priority of this function over other registered functions
+ *
+ * Same as tracepoint_probe_register_prio() except that it will not warn
+ * if the tracepoint is already registered.
+ */
+int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe,
+ void *data, int prio)
+{
+ struct tracepoint_func tp_func;
+ int ret;
+
+ mutex_lock(&tracepoints_mutex);
+ tp_func.func = probe;
+ tp_func.data = data;
+ tp_func.prio = prio;
+ ret = tracepoint_add_func(tp, &tp_func, prio, false);
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
+
+/**
* tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
* @tp: tracepoint
* @probe: probe handler
@@ -366,7 +508,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio);
+ ret = tracepoint_add_func(tp, &tp_func, prio, true);
mutex_unlock(&tracepoints_mutex);
return ret;
}
@@ -435,7 +577,7 @@ bool trace_module_has_bad_taint(struct module *mod)
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
/**
- * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * register_tracepoint_module_notifier - register tracepoint coming/going notifier
* @nb: notifier block
*
* Notifiers registered with this function are called on module
@@ -461,7 +603,7 @@ end:
EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
/**
- * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier
* @nb: notifier block
*
* The notifier block callback should expect a "struct tp_module" data
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8d8874f1c35e..eb03f3c68375 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,12 @@
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
+struct ucounts init_ucounts = {
+ .ns = &init_user_ns,
+ .uid = GLOBAL_ROOT_UID,
+ .count = ATOMIC_INIT(1),
+};
+
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);
@@ -52,14 +58,17 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-#define UCOUNT_ENTRY(name) \
- { \
- .procname = name, \
- .maxlen = sizeof(int), \
- .mode = 0644, \
- .proc_handler = proc_dointvec_minmax, \
- .extra1 = SYSCTL_ZERO, \
- .extra2 = SYSCTL_INT_MAX, \
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(long), \
+ .mode = 0644, \
+ .proc_handler = proc_doulongvec_minmax, \
+ .extra1 = &ue_zero, \
+ .extra2 = &ue_int_max, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
@@ -78,6 +87,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_fanotify_groups"),
UCOUNT_ENTRY("max_fanotify_marks"),
#endif
+ { },
+ { },
+ { },
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -129,10 +142,28 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
return NULL;
}
-static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+static void hlist_add_ucounts(struct ucounts *ucounts)
+{
+ struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+ spin_lock_irq(&ucounts_lock);
+ hlist_add_head(&ucounts->node, hashent);
+ spin_unlock_irq(&ucounts_lock);
+}
+
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+ if (ucounts && atomic_add_negative(1, &ucounts->count)) {
+ put_ucounts(ucounts);
+ ucounts = NULL;
+ }
+ return ucounts;
+}
+
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
+ long overflow;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -145,7 +176,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns;
new->uid = uid;
- new->count = 0;
+ atomic_set(&new->count, 1);
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -153,40 +184,38 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
kfree(new);
} else {
hlist_add_head(&new->node, hashent);
- ucounts = new;
+ spin_unlock_irq(&ucounts_lock);
+ return new;
}
}
- if (ucounts->count == INT_MAX)
- ucounts = NULL;
- else
- ucounts->count += 1;
+ overflow = atomic_add_negative(1, &ucounts->count);
spin_unlock_irq(&ucounts_lock);
+ if (overflow) {
+ put_ucounts(ucounts);
+ return NULL;
+ }
return ucounts;
}
-static void put_ucounts(struct ucounts *ucounts)
+void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- spin_lock_irqsave(&ucounts_lock, flags);
- ucounts->count -= 1;
- if (!ucounts->count)
+ if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
- else
- ucounts = NULL;
- spin_unlock_irqrestore(&ucounts_lock, flags);
-
- kfree(ucounts);
+ spin_unlock_irqrestore(&ucounts_lock, flags);
+ kfree(ucounts);
+ }
}
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{
- int c, old;
- c = atomic_read(v);
+ long c, old;
+ c = atomic_long_read(v);
for (;;) {
if (unlikely(c >= u))
return false;
- old = atomic_cmpxchg(v, c, c+1);
+ old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c))
return true;
c = old;
@@ -198,19 +227,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
{
struct ucounts *ucounts, *iter, *bad;
struct user_namespace *tns;
- ucounts = get_ucounts(ns, uid);
+ ucounts = alloc_ucounts(ns, uid);
for (iter = ucounts; iter; iter = tns->ucounts) {
- int max;
+ long max;
tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]);
- if (!atomic_inc_below(&iter->ucount[type], max))
+ if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail;
}
return ucounts;
fail:
bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
- atomic_dec(&iter->ucount[type]);
+ atomic_long_dec(&iter->ucount[type]);
put_ucounts(ucounts);
return NULL;
@@ -220,12 +249,103 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
}
put_ucounts(ucounts);
}
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ long ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long max = READ_ONCE(iter->ns->ucount_max[type]);
+ long new = atomic_long_add_return(v, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ ret = LONG_MAX;
+ else if (iter == ucounts)
+ ret = new;
+ }
+ return ret;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ long new = -1; /* Silence compiler warning */
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ if (iter == ucounts)
+ new = dec;
+ }
+ return (new == 0);
+}
+
+static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
+ struct ucounts *last, enum ucount_type type)
+{
+ struct ucounts *iter, *next;
+ for (iter = ucounts; iter != last; iter = next) {
+ long dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ next = iter->ns->ucounts;
+ if (dec == 0)
+ put_ucounts(iter);
+ }
+}
+
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ do_dec_rlimit_put_ucounts(ucounts, NULL, type);
+}
+
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ /* Caller must hold a reference to ucounts */
+ struct ucounts *iter;
+ long dec, ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long max = READ_ONCE(iter->ns->ucount_max[type]);
+ long new = atomic_long_add_return(1, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ goto unwind;
+ if (iter == ucounts)
+ ret = new;
+ /*
+ * Grab an extra ucount reference for the caller when
+ * the rlimit count was previously 0.
+ */
+ if (new != 1)
+ continue;
+ if (!get_ucounts(iter))
+ goto dec_unwind;
+ }
+ return ret;
+dec_unwind:
+ dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+unwind:
+ do_dec_rlimit_put_ucounts(ucounts, iter, type);
+ return 0;
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+ struct ucounts *iter;
+ if (get_ucounts_value(ucounts, type) > max)
+ return true;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ max = READ_ONCE(iter->ns->ucount_max[type]);
+ if (get_ucounts_value(iter, type) > max)
+ return true;
+ }
+ return false;
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
@@ -241,6 +361,8 @@ static __init int user_namespace_sysctl_init(void)
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
+ hlist_add_ucounts(&init_ucounts);
+ inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/user.c b/kernel/user.c
index a2478cddf536..e2cf8c22b539 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
- .sigpending = ATOMIC_INIT(0),
- .locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
};
@@ -132,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
return NULL;
}
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+ return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
@@ -141,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
+ user_epoll_free(up);
kmem_cache_free(uid_cachep, up);
}
@@ -188,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)
new->uid = uid;
refcount_set(&new->__count, 1);
+ if (user_epoll_alloc(new)) {
+ kmem_cache_free(uid_cachep, new);
+ return NULL;
+ }
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
@@ -198,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ user_epoll_free(new);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
@@ -219,6 +238,9 @@ static int __init uid_cache_init(void)
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(uidhash_table + n);
+ if (user_epoll_alloc(&root_user))
+ panic("root_user epoll percpu counter alloc failed");
+
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8d62863721b0..6b2e3ca7ee99 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -119,9 +119,13 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
INIT_WORK(&ns->work, free_user_ns);
- for (i = 0; i < UCOUNT_COUNTS; i++) {
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
@@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
+ if (set_cred_ucounts(cred) < 0)
+ return -EINVAL;
+
return 0;
}
@@ -1378,7 +1385,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
- user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index bb7bb3b478ab..9dae1f648713 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
if (IS_ERR(mnt))
return mnt;
- file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+ file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
if (IS_ERR(file)) {
mntput(mnt);
return ERR_CAST(file);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 92d3bcc5a5e0..ad912511a0c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -92,7 +92,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
* own hardlockup detector.
*
* watchdog_nmi_enable/disable can be implemented to start and stop when
- * softlockup watchdog threads start and stop. The arch must select the
+ * softlockup watchdog start and stop. The arch must select the
* SOFTLOCKUP_DETECTOR Kconfig.
*/
int __weak watchdog_nmi_enable(unsigned int cpu)
@@ -335,7 +335,7 @@ static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/*
- * The watchdog thread function - touches the timestamp.
+ * The watchdog feed function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
@@ -558,11 +558,7 @@ static void lockup_detector_reconfigure(void)
}
/*
- * Create the watchdog thread infrastructure and configure the detector(s).
- *
- * The threads are not unparked as watchdog_allowed_mask is empty. When
- * the threads are successfully initialized, take the proper locks and
- * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ * Create the watchdog infrastructure and configure the detector(s).
*/
static __init void lockup_detector_setup(void)
{
@@ -628,7 +624,7 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
-/* Propagate any changes to the watchdog threads */
+/* Propagate any changes to the watchdog infrastructure */
static void proc_watchdog_update(void)
{
/* Remove impossible cpus to keep sysctl output clean. */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 50142fc08902..1b3eb1e9531f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -205,9 +205,26 @@ struct pool_workqueue {
int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
+
+ /*
+ * nr_active management and WORK_STRUCT_INACTIVE:
+ *
+ * When pwq->nr_active >= max_active, new work item is queued to
+ * pwq->inactive_works instead of pool->worklist and marked with
+ * WORK_STRUCT_INACTIVE.
+ *
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate
+ * in pwq->nr_active and all work items in pwq->inactive_works are
+ * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
+ * work items are in pwq->inactive_works. Some of them are ready to
+ * run in pool->worklist or worker->scheduled. Those work itmes are
+ * only struct wq_barrier which is used for flush_work() and should
+ * not participate in pwq->nr_active. For non-barrier work item, it
+ * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ */
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
- struct list_head delayed_works; /* L: delayed works */
+ struct list_head inactive_works; /* L: inactive works */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
@@ -524,7 +541,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
/**
- * worker_pool_assign_id - allocate ID and assing it to @pool
+ * worker_pool_assign_id - allocate ID and assign it to @pool
* @pool: the pool pointer of interest
*
* Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
@@ -579,9 +596,9 @@ static unsigned int work_color_to_flags(int color)
return color << WORK_STRUCT_COLOR_SHIFT;
}
-static int get_work_color(struct work_struct *work)
+static int get_work_color(unsigned long work_data)
{
- return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+ return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
((1 << WORK_STRUCT_COLOR_BITS) - 1);
}
@@ -1136,7 +1153,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static void pwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_inactive_work(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
@@ -1144,22 +1161,22 @@ static void pwq_activate_delayed_work(struct work_struct *work)
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
pwq->nr_active++;
}
-static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
+static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&pwq->delayed_works,
+ struct work_struct *work = list_first_entry(&pwq->inactive_works,
struct work_struct, entry);
- pwq_activate_delayed_work(work);
+ pwq_activate_inactive_work(work);
}
/**
* pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
* @pwq: pwq of interest
- * @color: color of work which left the queue
+ * @work_data: work_data of work which left the queue
*
* A work either has completed or is removed from pending queue,
* decrement nr_in_flight of its pwq and handle workqueue flushing.
@@ -1167,21 +1184,21 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
- /* uncolored work items don't participate in flushing or nr_active */
- if (color == WORK_NO_COLOR)
- goto out_put;
+ int color = get_work_color(work_data);
- pwq->nr_in_flight[color]--;
-
- pwq->nr_active--;
- if (!list_empty(&pwq->delayed_works)) {
- /* one down, submit a delayed one */
- if (pwq->nr_active < pwq->max_active)
- pwq_activate_first_delayed(pwq);
+ if (!(work_data & WORK_STRUCT_INACTIVE)) {
+ pwq->nr_active--;
+ if (!list_empty(&pwq->inactive_works)) {
+ /* one down, submit an inactive one */
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_inactive(pwq);
+ }
}
+ pwq->nr_in_flight[color]--;
+
/* is flush in progress and are we at the flushing tip? */
if (likely(pwq->flush_color != color))
goto out_put;
@@ -1281,17 +1298,21 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
debug_work_deactivate(work);
/*
- * A delayed work item cannot be grabbed directly because
- * it might have linked NO_COLOR work items which, if left
- * on the delayed_list, will confuse pwq->nr_active
+ * A cancelable inactive work item must be in the
+ * pwq->inactive_works since a queued barrier can't be
+ * canceled (see the comments in insert_wq_barrier()).
+ *
+ * An inactive work item cannot be grabbed directly because
+ * it might have linked barrier work items which, if left
+ * on the inactive_works list, will confuse pwq->nr_active
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
- pwq_activate_delayed_work(work);
+ if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
+ pwq_activate_inactive_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(pwq, get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1490,8 +1511,8 @@ retry:
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else {
- work_flags |= WORK_STRUCT_DELAYED;
- worklist = &pwq->delayed_works;
+ work_flags |= WORK_STRUCT_INACTIVE;
+ worklist = &pwq->inactive_works;
}
debug_work_activate(work);
@@ -1912,14 +1933,14 @@ static void worker_detach_from_pool(struct worker *worker)
*/
static struct worker *create_worker(struct worker_pool *pool)
{
- struct worker *worker = NULL;
- int id = -1;
+ struct worker *worker;
+ int id;
char id_buf[16];
/* ID is needed to determine kthread name */
- id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
+ id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
if (id < 0)
- goto fail;
+ return NULL;
worker = alloc_worker(pool->node);
if (!worker)
@@ -1954,8 +1975,7 @@ static struct worker *create_worker(struct worker_pool *pool)
return worker;
fail:
- if (id >= 0)
- ida_simple_remove(&pool->worker_ida, id);
+ ida_free(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
@@ -2173,7 +2193,7 @@ __acquires(&pool->lock)
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
- int work_color;
+ unsigned long work_data;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
/*
@@ -2209,7 +2229,8 @@ __acquires(&pool->lock)
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
- work_color = get_work_color(work);
+ work_data = *work_data_bits(work);
+ worker->current_color = get_work_color(work_data);
/*
* Record wq name for cmdline and debug reporting, may get
@@ -2315,7 +2336,8 @@ __acquires(&pool->lock)
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
- pwq_dec_nr_in_flight(pwq, work_color);
+ worker->current_color = INT_MAX;
+ pwq_dec_nr_in_flight(pwq, work_data);
}
/**
@@ -2378,7 +2400,7 @@ woke_up:
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
- ida_simple_remove(&pool->worker_ida, worker->id);
+ ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
kfree(worker);
return 0;
@@ -2531,7 +2553,7 @@ repeat:
/*
* The above execution of rescued work items could
* have created more to rescue through
- * pwq_activate_first_delayed() or chained
+ * pwq_activate_first_inactive() or chained
* queueing. Let's put @pwq back on mayday list so
* that such back-to-back work items, which may be
* being used to relieve memory pressure, don't
@@ -2658,8 +2680,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
+ unsigned int work_flags = 0;
+ unsigned int work_color;
struct list_head *head;
- unsigned int linked = 0;
/*
* debugobject calls are safe here even with pool->lock locked
@@ -2674,24 +2697,31 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
+ /* The barrier work item does not participate in pwq->nr_active. */
+ work_flags |= WORK_STRUCT_INACTIVE;
+
/*
* If @target is currently being executed, schedule the
* barrier to the worker; otherwise, put it after @target.
*/
- if (worker)
+ if (worker) {
head = worker->scheduled.next;
- else {
+ work_color = worker->current_color;
+ } else {
unsigned long *bits = work_data_bits(target);
head = target->entry.next;
/* there can already be other linked works, inherit and set */
- linked = *bits & WORK_STRUCT_LINKED;
+ work_flags |= *bits & WORK_STRUCT_LINKED;
+ work_color = get_work_color(*bits);
__set_bit(WORK_STRUCT_LINKED_BIT, bits);
}
+ pwq->nr_in_flight[work_color]++;
+ work_flags |= work_color_to_flags(work_color);
+
debug_work_activate(&barr->work);
- insert_work(pwq, &barr->work, head,
- work_color_to_flags(WORK_NO_COLOR) | linked);
+ insert_work(pwq, &barr->work, head, work_flags);
}
/**
@@ -2957,7 +2987,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -3293,7 +3323,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -3305,7 +3335,7 @@ int schedule_on_each_cpu(work_func_t func)
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(works, cpu));
- put_online_cpus();
+ cpus_read_unlock();
free_percpu(works);
return 0;
}
@@ -3676,15 +3706,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
+ bool is_last = false;
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ /*
+ * when @pwq is not linked, it doesn't hold any reference to the
+ * @wq, and @wq is invalid to access.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);
@@ -3707,7 +3743,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
* @pwq: target pool_workqueue
*
* If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate delayed work items
+ * workqueue's saved_max_active and activate inactive work items
* accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
*/
static void pwq_adjust_max_active(struct pool_workqueue *pwq)
@@ -3736,9 +3772,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
pwq->max_active = wq->saved_max_active;
- while (!list_empty(&pwq->delayed_works) &&
+ while (!list_empty(&pwq->inactive_works) &&
pwq->nr_active < pwq->max_active) {
- pwq_activate_first_delayed(pwq);
+ pwq_activate_first_inactive(pwq);
kick = true;
}
@@ -3757,7 +3793,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
-/* initialize newly alloced @pwq which is associated with @wq and @pool */
+/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
@@ -3769,7 +3805,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->wq = wq;
pwq->flush_color = -1;
pwq->refcnt = 1;
- INIT_LIST_HEAD(&pwq->delayed_works);
+ INIT_LIST_HEAD(&pwq->inactive_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
@@ -4010,14 +4046,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
static void apply_wqattrs_lock(void)
{
/* CPUs should stay stable across pwq creations and installations */
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&wq_pool_mutex);
}
static void apply_wqattrs_unlock(void)
{
mutex_unlock(&wq_pool_mutex);
- put_online_cpus();
+ cpus_read_unlock();
}
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
@@ -4062,7 +4098,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Performs GFP_KERNEL allocations.
*
- * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus().
+ * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
*
* Return: 0 on success and -errno on failure.
*/
@@ -4190,7 +4226,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
return 0;
}
- get_online_cpus();
+ cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
@@ -4200,7 +4236,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
@@ -4356,7 +4392,7 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
return true;
- if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works))
return true;
return false;
@@ -4552,7 +4588,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
else
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- ret = !list_empty(&pwq->delayed_works);
+ ret = !list_empty(&pwq->inactive_works);
preempt_enable();
rcu_read_unlock();
@@ -4748,11 +4784,11 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_cont("\n");
}
- if (!list_empty(&pwq->delayed_works)) {
+ if (!list_empty(&pwq->inactive_works)) {
bool comma = false;
- pr_info(" delayed:");
- list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_info(" inactive:");
+ list_for_each_entry(work, &pwq->inactive_works, entry) {
pr_cont_work(comma, work);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
@@ -4782,7 +4818,7 @@ void show_workqueue_state(void)
bool idle = true;
for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
idle = false;
break;
}
@@ -4794,8 +4830,16 @@ void show_workqueue_state(void)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
show_pwq(pwq);
+ printk_deferred_exit();
+ }
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
@@ -4813,7 +4857,12 @@ void show_workqueue_state(void)
raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
-
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" hung=%us workers=%d",
@@ -4828,6 +4877,7 @@ void show_workqueue_state(void)
first = false;
}
pr_cont("\n");
+ printk_deferred_exit();
next_pool:
raw_spin_unlock_irqrestore(&pool->lock, flags);
/*
@@ -5162,10 +5212,10 @@ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
long ret = -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
if (cpu_online(cpu))
ret = work_on_cpu(cpu, fn, arg);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe);
@@ -5177,7 +5227,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe);
* freeze_workqueues_begin - begin freezing workqueues
*
* Start freezing workqueues. After this function returns, all freezable
- * workqueues will queue new works to their delayed_works list instead of
+ * workqueues will queue new works to their inactive_works list instead of
* pool->worklist.
*
* CONTEXT:
@@ -5325,7 +5375,7 @@ static int workqueue_apply_unbound_cpumask(void)
* the affinity of all unbound workqueues. This function check the @cpumask
* and apply it to all unbound workqueues and updates all pwqs of them.
*
- * Retun: 0 - Success
+ * Return: 0 - Success
* -EINVAL - Invalid @cpumask
* -ENOMEM - Failed to allocate memory for attrs or pwqs.
*/
@@ -5437,7 +5487,7 @@ static ssize_t wq_pool_ids_show(struct device *dev,
const char *delim = "";
int node, written = 0;
- get_online_cpus();
+ cpus_read_lock();
rcu_read_lock();
for_each_node(node) {
written += scnprintf(buf + written, PAGE_SIZE - written,
@@ -5447,7 +5497,7 @@ static ssize_t wq_pool_ids_show(struct device *dev,
}
written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
rcu_read_unlock();
- put_online_cpus();
+ cpus_read_unlock();
return written;
}
@@ -5896,6 +5946,13 @@ static void __init wq_numa_init(void)
return;
}
+ for_each_possible_cpu(cpu) {
+ if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
+ pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
+ return;
+ }
+ }
+
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_unbound_numa_attrs_buf);
@@ -5913,11 +5970,6 @@ static void __init wq_numa_init(void)
for_each_possible_cpu(cpu) {
node = cpu_to_node(cpu);
- if (WARN_ON(node == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- /* happens iff arch is bonkers, let's just proceed */
- return;
- }
cpumask_set_cpu(cpu, tbl[node]);
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 498de0e909a4..e00b1204a8e9 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -30,7 +30,8 @@ struct worker {
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
- struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ unsigned int current_color; /* L: current_work's color */
struct list_head scheduled; /* L: scheduled works */
/* 64 bytes boundary on 64bit, 32 on 32bit */