aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt19
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/audit_fsnotify.c2
-rw-r--r--kernel/auditsc.c21
-rw-r--r--kernel/bpf/bpf_iter.c2
-rw-r--r--kernel/bpf/bpf_lru_list.c7
-rw-r--r--kernel/bpf/btf.c109
-rw-r--r--kernel/bpf/cgroup.c120
-rw-r--r--kernel/bpf/core.c114
-rw-r--r--kernel/bpf/cpumap.c46
-rw-r--r--kernel/bpf/devmap.c4
-rw-r--r--kernel/bpf/disasm.c43
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c12
-rw-r--r--kernel/bpf/inode.c13
-rw-r--r--kernel/bpf/preload/iterators/iterators.c2
-rw-r--r--kernel/bpf/stackmap.c143
-rw-r--r--kernel/bpf/syscall.c16
-rw-r--r--kernel/bpf/task_iter.c267
-rw-r--r--kernel/bpf/trampoline.c77
-rw-r--r--kernel/bpf/verifier.c1145
-rw-r--r--kernel/capability.c14
-rw-r--r--kernel/cgroup/cgroup.c57
-rw-r--r--kernel/cgroup/cpuset.c6
-rw-r--r--kernel/cpu.c7
-rw-r--r--kernel/debug/debug_core.c39
-rw-r--r--kernel/debug/gdbstub.c4
-rw-r--r--kernel/debug/kdb/kdb_private.h12
-rw-r--r--kernel/debug/kdb/kdb_support.c53
-rw-r--r--kernel/dma/Kconfig3
-rw-r--r--kernel/dma/map_benchmark.c12
-rw-r--r--kernel/dma/mapping.c42
-rw-r--r--kernel/dma/swiotlb.c310
-rw-r--r--kernel/entry/common.c17
-rw-r--r--kernel/events/core.c288
-rw-r--r--kernel/events/uprobes.c82
-rw-r--r--kernel/fork.c38
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c16
-rw-r--r--kernel/groups.c7
-rw-r--r--kernel/irq/irq_sim.c4
-rw-r--r--kernel/irq/irqdomain.c11
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/resend.c4
-rw-r--r--kernel/jump_label.c8
-rw-r--r--kernel/kallsyms.c8
-rw-r--r--kernel/kcsan/core.c26
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kprobes.c44
-rw-r--r--kernel/livepatch/core.c7
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lockdep.c186
-rw-r--r--kernel/locking/locktorture.c1
-rw-r--r--kernel/locking/mutex.c35
-rw-r--r--kernel/locking/qrwlock.c1
-rw-r--r--kernel/locking/rtmutex.c75
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/locking/rwsem.h0
-rw-r--r--kernel/locking/semaphore.c2
-rw-r--r--kernel/module.c481
-rw-r--r--kernel/module_signature.c2
-rw-r--r--kernel/module_signing.c2
-rw-r--r--kernel/power/Kconfig12
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/printk/printk.c28
-rw-r--r--kernel/printk/printk_ringbuffer.h2
-rw-r--r--kernel/printk/printk_safe.c16
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Kconfig5
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcu_segcblist.c216
-rw-r--r--kernel/rcu/rcu_segcblist.h57
-rw-r--r--kernel/rcu/rcutorture.c395
-rw-r--r--kernel/rcu/refscale.c23
-rw-r--r--kernel/rcu/srcutiny.c77
-rw-r--r--kernel/rcu/srcutree.c147
-rw-r--r--kernel/rcu/tasks.h79
-rw-r--r--kernel/rcu/tree.c154
-rw-r--r--kernel/rcu/tree.h4
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h398
-rw-r--r--kernel/rcu/tree_stall.h60
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/resource.c98
-rw-r--r--kernel/scftorture.c6
-rw-r--r--kernel/sched/core.c532
-rw-r--r--kernel/sched/cpufreq_schedutil.c122
-rw-r--r--kernel/sched/deadline.c94
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c324
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/membarrier.c6
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h51
-rw-r--r--kernel/sched/topology.c99
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c10
-rw-r--r--kernel/smp.c4
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/static_call.c101
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/time/alarmtimer.c10
-rw-r--r--kernel/time/hrtimer.c62
-rw-r--r--kernel/time/namespace.c6
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/time/timer.c14
-rw-r--r--kernel/torture.c167
-rw-r--r--kernel/trace/Kconfig37
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c53
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/error_report-traces.c11
-rw-r--r--kernel/trace/preemptirq_delay_test.c14
-rw-r--r--kernel/trace/ring_buffer.c62
-rw-r--r--kernel/trace/trace.c289
-rw-r--r--kernel/trace/trace.h64
-rw-r--r--kernel/trace/trace_branch.c6
-rw-r--r--kernel/trace/trace_dynevent.c35
-rw-r--r--kernel/trace/trace_dynevent.h4
-rw-r--r--kernel/trace/trace_event_perf.c5
-rw-r--r--kernel/trace/trace_events.c40
-rw-r--r--kernel/trace/trace_events_inject.c6
-rw-r--r--kernel/trace/trace_events_synth.c322
-rw-r--r--kernel/trace/trace_functions.c31
-rw-r--r--kernel/trace/trace_functions_graph.c32
-rw-r--r--kernel/trace/trace_hwlat.c7
-rw-r--r--kernel/trace/trace_irqsoff.c86
-rw-r--r--kernel/trace/trace_kprobe.c47
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_output.c12
-rw-r--r--kernel/trace/trace_probe.c17
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_wakeup.c71
-rw-r--r--kernel/trace/trace_syscalls.c20
-rw-r--r--kernel/trace/trace_uprobe.c23
-rw-r--r--kernel/tracepoint.c91
-rw-r--r--kernel/watch_queue.c2
-rw-r--r--kernel/workqueue.c4
146 files changed, 6092 insertions, 2869 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..416017301660 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -40,6 +40,7 @@ config PREEMPT
depends on !ARCH_NO_PREEMPT
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -80,3 +81,21 @@ config PREEMPT_COUNT
config PREEMPTION
bool
select PREEMPT_COUNT
+
+config PREEMPT_DYNAMIC
+ bool
+ help
+ This option allows to define the preemption model on the kernel
+ command line parameter and thus override the default preemption
+ model defined during compile time.
+
+ The feature is primarily interesting for Linux distributions which
+ provide a pre-built kernel binary to reduce the number of kernel
+ flavors they offer while still offering different usecases.
+
+ The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled
+ but if runtime patching is not available for the specific architecture
+ then the potential overhead should be considered.
+
+ Interesting if you want the same pre-built kernel should be used for
+ both Server and Desktop workloads.
diff --git a/kernel/Makefile b/kernel/Makefile
index aa7368c7eabf..320f1f3941b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,7 +51,7 @@ obj-y += livepatch/
obj-y += dma/
obj-y += entry/
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 1ffc2e059027..551a394bc8f4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2285,7 +2285,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
+ loginuid = from_kuid(&init_user_ns, kloginuid);
tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
@@ -2365,7 +2365,7 @@ int audit_signal_info(int sig, struct task_struct *t)
*
* We can not do a netlink send inside an irq context because it blocks (last
* arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
- * queue and a tasklet is scheduled to remove them from the queue outside the
+ * queue and a kthread is scheduled to remove them from the queue outside the
* irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 5b3f01da172b..60739d5e3373 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
- return (void *)dentry; /* returning an error */
+ return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce8c9e2279ba..47fb48f42c93 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -799,12 +799,12 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
-/* At syscall entry and exit time, this filter is called if the
- * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+/* At syscall exit time, this filter is called if the audit_state is
+ * not low enough that auditing cannot take place, but is also not
+ * high enough that we already know we have to write an audit record
+ * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx,
struct list_head *list)
{
@@ -812,7 +812,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
enum audit_state state;
if (auditd_test_task(tsk))
- return AUDIT_DISABLED;
+ return;
rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
@@ -821,11 +821,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
&state, false)) {
rcu_read_unlock();
ctx->current_state = state;
- return state;
+ return;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return;
}
/*
@@ -1930,7 +1930,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(dentry, &caps);
+ rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
if (rc)
return rc;
@@ -2481,7 +2481,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&init_user_ns,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5454161407f1..a0d9eade9c80 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 1b6b9349cb85..d99e89f113c4 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
static void bpf_common_lru_push_free(struct bpf_lru *lru,
struct bpf_lru_node *node)
{
+ u8 node_type = READ_ONCE(node->type);
unsigned long flags;
- if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
- WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
return;
- if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
struct bpf_lru_locallist *loc_l;
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 84a36ee4a4c2..b1a76fe046cb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static u32 btf_nr_types_total(const struct btf *btf)
+u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
const char *tname;
u32 i, total;
- total = btf_nr_types_total(btf);
+ total = btf_nr_types(btf);
for (i = 1; i < total; i++) {
t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != kind)
@@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (!btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen == 0");
- return -EINVAL;
- }
-
if (!t->size) {
btf_verifier_log_type(env, t, "size == 0");
return -EINVAL;
@@ -4326,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
@@ -5296,15 +5289,16 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
* Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
*/
int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+ struct bpf_reg_state *regs)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t;
- u32 i, nargs, btf_id;
+ const struct btf_type *t, *ref_t;
+ u32 i, nargs, btf_id, type_size;
const char *tname;
+ bool is_global;
if (!prog->aux->func_info)
return -EINVAL;
@@ -5338,38 +5332,57 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
goto out;
}
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *reg = &regs[i + 1];
+
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- if (reg[i + 1].type == SCALAR_VALUE)
+ if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", i + 1);
goto out;
}
if (btf_type_is_ptr(t)) {
- if (reg[i + 1].type == SCALAR_VALUE) {
- bpf_log(log, "R%d is not a pointer\n", i + 1);
- goto out;
- }
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg[i + 1].type != PTR_TO_CTX) {
+ if (reg->type != PTR_TO_CTX) {
bpf_log(log,
"arg#%d expected pointer to ctx, but got %s\n",
i, btf_kind_str[BTF_INFO_KIND(t->info)]);
goto out;
}
- if (check_ctx_reg(env, &reg[i + 1], i + 1))
+ if (check_ctx_reg(env, reg, i + 1))
goto out;
continue;
}
+
+ if (!is_global)
+ goto out;
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+ ref_t = btf_resolve_size(btf, t, &type_size);
+ if (IS_ERR(ref_t)) {
+ bpf_log(log,
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ PTR_ERR(ref_t));
+ goto out;
+ }
+
+ if (check_mem_reg(env, reg, i + 1, type_size))
+ goto out;
+
+ continue;
}
bpf_log(log, "Unrecognized arg#%d type %s\n",
i, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5393,14 +5406,14 @@ out:
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+ struct bpf_reg_state *regs)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t;
+ const struct btf_type *t, *ref_t;
u32 i, nargs, btf_id;
const char *tname;
@@ -5464,16 +5477,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *reg = &regs[i + 1];
+
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- reg[i + 1].type = SCALAR_VALUE;
+ reg->type = SCALAR_VALUE;
continue;
}
- if (btf_type_is_ptr(t) &&
- btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- reg[i + 1].type = PTR_TO_CTX;
+ if (btf_type_is_ptr(t)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg->type = PTR_TO_CTX;
+ continue;
+ }
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+ ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+ if (IS_ERR(ref_t)) {
+ bpf_log(log,
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ PTR_ERR(ref_t));
+ return -EINVAL;
+ }
+
+ reg->type = PTR_TO_MEM_OR_NULL;
+ reg->id = ++env->id_gen;
+
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
@@ -5743,6 +5775,11 @@ bool btf_is_kernel(const struct btf *btf)
return btf->kernel_btf;
}
+bool btf_is_module(const struct btf *btf)
+{
+ return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
+}
+
static int btf_id_cmp_func(const void *a, const void *b)
{
const int *pa = a, *pb = b;
@@ -5877,3 +5914,25 @@ static int __init btf_module_init(void)
fs_initcall(btf_module_init);
#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct module *btf_try_get_module(const struct btf *btf)
+{
+ struct module *res = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->btf != btf)
+ continue;
+
+ if (try_module_get(btf_mod->module))
+ res = btf_mod->module;
+
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return res;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6aa9e10c6335..b567ca46555c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -19,7 +19,7 @@
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work)
if (pl->link)
bpf_cgroup_link_auto_detach(pl->link);
kfree(pl);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ static_branch_dec(&cgroup_bpf_enabled_key[type]);
}
old_array = rcu_dereference_protected(
cgrp->bpf.effective[type],
@@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (old_prog)
bpf_prog_put(old_prog);
else
- static_branch_inc(&cgroup_bpf_enabled_key);
+ static_branch_inc(&cgroup_bpf_enabled_key[type]);
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
@@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
cgrp->bpf.flags[type] = 0;
if (old_prog)
bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ static_branch_dec(&cgroup_bpf_enabled_key[type]);
return 0;
cleanup:
@@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* @uaddr: sockaddr struct provided by user
* @type: The type of program to be exectuted
* @t_ctx: Pointer to attach type specific context
+ * @flags: Pointer to u32 which contains higher bits of BPF program
+ * return value (OR'ed together).
*
* socket is expected to be of type INET or INET6.
*
@@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
enum bpf_attach_type type,
- void *t_ctx)
+ void *t_ctx,
+ u32 *flags)
{
struct bpf_sock_addr_kern ctx = {
.sk = sk,
@@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
+ BPF_PROG_RUN, flags);
return ret == 1 ? 0 : -EPERM;
}
@@ -1298,7 +1302,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
return empty;
}
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+ struct bpf_sockopt_buf *buf)
{
if (unlikely(max_optlen < 0))
return -EINVAL;
@@ -1310,6 +1315,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
max_optlen = PAGE_SIZE;
}
+ if (max_optlen <= sizeof(buf->data)) {
+ /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+ * bytes avoid the cost of kzalloc.
+ */
+ ctx->optval = buf->data;
+ ctx->optval_end = ctx->optval + max_optlen;
+ return max_optlen;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
@@ -1319,16 +1333,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
return max_optlen;
}
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
{
+ if (ctx->optval == buf->data)
+ return;
kfree(ctx->optval);
}
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
+{
+ return ctx->optval != buf->data;
+}
+
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
int *optname, char __user *optval,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = *level,
@@ -1340,8 +1364,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
return 0;
/* Allocate a bit more than the initial user buffer for
@@ -1350,7 +1373,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1390,14 +1413,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
if (ctx.optlen != 0) {
*optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+ /* We've used bpf_sockopt_kern->buf as an intermediary
+ * storage, but the BPF program indicates that we need
+ * to pass this data to the kernel setsockopt handler.
+ * No way to export on-stack buf, have to allocate a
+ * new buffer.
+ */
+ if (!sockopt_buf_allocated(&ctx, &buf)) {
+ void *p = kmalloc(ctx.optlen, GFP_USER);
+
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(p, ctx.optval, ctx.optlen);
+ *kernel_optval = p;
+ } else {
+ *kernel_optval = ctx.optval;
+ }
/* export and don't free sockopt buf */
return 0;
}
}
out:
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
@@ -1407,6 +1447,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = level,
@@ -1419,13 +1460,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
return retval;
ctx.optlen = max_optlen;
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1488,9 +1528,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ret = ctx.retval;
out:
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ .optlen = *optlen,
+ .optval = optval,
+ .optval_end = optval + *optlen,
+ };
+ int ret;
+
+ /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+ * user data back into BPF buffer when reval != 0. This is
+ * done as an optimization to avoid extra copy, assuming
+ * kernel won't populate the data in case of an error.
+ * Here we always pass the data and memset() should
+ * be called if that data shouldn't be "exported".
+ */
+
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ if (!ret)
+ return -EPERM;
+
+ if (ctx.optlen > *optlen)
+ return -EFAULT;
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval)
+ return -EFAULT;
+
+ /* BPF programs can shrink the buffer, export the modifications.
+ */
+ if (ctx.optlen != 0)
+ *optlen = ctx.optlen;
+
+ return ctx.retval;
+}
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 261f8692d0d2..3a283bf97f2f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
vfree(fp);
return NULL;
}
+ fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ if (!fp->active) {
+ vfree(fp);
+ kfree(aux);
+ return NULL;
+ }
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
@@ -114,8 +120,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (!prog)
return NULL;
- prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
- if (!prog->aux->stats) {
+ prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->stats) {
+ free_percpu(prog->active);
kfree(prog->aux);
vfree(prog);
return NULL;
@@ -124,7 +131,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
for_each_possible_cpu(cpu) {
struct bpf_prog_stats *pstats;
- pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ pstats = per_cpu_ptr(prog->stats, cpu);
u64_stats_init(&pstats->syncp);
}
return prog;
@@ -238,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
* reallocated structure.
*/
fp_old->aux = NULL;
+ fp_old->stats = NULL;
+ fp_old->active = NULL;
__bpf_prog_free(fp_old);
}
@@ -249,10 +258,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
if (fp->aux) {
mutex_destroy(&fp->aux->used_maps_mutex);
mutex_destroy(&fp->aux->dst_mutex);
- free_percpu(fp->aux->stats);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
+ free_percpu(fp->stats);
+ free_percpu(fp->active);
vfree(fp);
}
@@ -1108,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
* clone is guaranteed to not be locked.
*/
fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
__bpf_prog_free(fp);
}
@@ -1309,8 +1321,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, XADD, W), \
- INSN_3(STX, XADD, DW), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -1618,13 +1630,59 @@ out:
LDX_PROBE(DW, 8)
#undef LDX_PROBE
- STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
- atomic_add((u32) SRC, (atomic_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
- atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
- (DST + insn->off));
+#define ATOMIC_ALU_OP(BOP, KOP) \
+ case BOP: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
+ (DST + insn->off)); \
+ else \
+ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
+ (DST + insn->off)); \
+ break; \
+ case BOP | BPF_FETCH: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ SRC = (u32) atomic_fetch_##KOP( \
+ (u32) SRC, \
+ (atomic_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ SRC = (u64) atomic64_fetch_##KOP( \
+ (u64) SRC, \
+ (atomic64_t *)(unsigned long) (DST + insn->off)); \
+ break;
+
+ STX_ATOMIC_DW:
+ STX_ATOMIC_W:
+ switch (IMM) {
+ ATOMIC_ALU_OP(BPF_ADD, add)
+ ATOMIC_ALU_OP(BPF_AND, and)
+ ATOMIC_ALU_OP(BPF_OR, or)
+ ATOMIC_ALU_OP(BPF_XOR, xor)
+#undef ATOMIC_ALU_OP
+
+ case BPF_XCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ SRC = (u32) atomic_xchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) SRC);
+ else
+ SRC = (u64) atomic64_xchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) SRC);
+ break;
+ case BPF_CMPXCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ BPF_R0 = (u32) atomic_cmpxchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) BPF_R0, (u32) SRC);
+ else
+ BPF_R0 = (u64) atomic64_cmpxchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) BPF_R0, (u64) SRC);
+ break;
+
+ default:
+ goto default_label;
+ }
CONT;
default_label:
@@ -1634,7 +1692,8 @@ out:
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
- pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
+ insn->code, insn->imm);
BUG_ON(1);
return 0;
}
@@ -2119,6 +2178,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
+ struct btf_mod_pair *used_btfs, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct btf_mod_pair *btf_mod;
+ u32 i;
+
+ for (i = 0; i < len; i++) {
+ btf_mod = &used_btfs[i];
+ if (btf_mod->module)
+ module_put(btf_mod->module);
+ btf_put(btf_mod->btf);
+ }
+#endif
+}
+
+static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ kfree(aux->used_btfs);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
@@ -2126,6 +2207,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
bpf_free_used_maps(aux);
+ bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
@@ -2262,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
* analysis code and wants explicit zero extension inserted by verifier.
* Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
*/
bool __weak bpf_jit_needs_zext(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 747313698178..5d1469de6921 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread);
}
-static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf,
- struct sk_buff *skb)
-{
- unsigned int hard_start_headroom;
- unsigned int frame_size;
- void *pkt_data_start;
-
- /* Part of headroom was reserved to xdpf */
- hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
-
- /* Memory size backing xdp_frame data already have reserved
- * room for build_skb to place skb_shared_info in tailroom.
- */
- frame_size = xdpf->frame_sz;
-
- pkt_data_start = xdpf->data - hard_start_headroom;
- skb = build_skb_around(skb, pkt_data_start, frame_size);
- if (unlikely(!skb))
- return NULL;
-
- skb_reserve(skb, hard_start_headroom);
- __skb_put(skb, xdpf->len);
- if (xdpf->metasize)
- skb_metadata_set(skb, xdpf->metasize);
-
- /* Essential SKB info: protocol and skb->dev */
- skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
-
- /* Optional SKB info, currently missing:
- * - HW checksum info (skb->ip_summed)
- * - HW RX hash (skb_set_hash)
- * - RX ring dev queue index (skb_record_rx_queue)
- */
-
- /* Until page_pool get SKB return path, release DMA here */
- xdp_release_frame(xdpf);
-
- /* Allow SKB to reuse area used by xdp_frame */
- xdp_scrub_frame(xdpf);
-
- return skb;
-}
-
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
@@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = skbs[i];
int ret;
- skb = cpu_map_build_skb(xdpf, skb);
+ skb = __xdp_build_skb_from_frame(xdpf, skb,
+ xdpf->dev_rx);
if (!skb) {
xdp_return_frame(xdpf);
continue;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f6e9c68afdd4..85d9d1b72a33 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier,
break;
/* will be freed in free_netdev() */
- netdev->xdp_bulkq =
- __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
- sizeof(void *), GFP_ATOMIC);
+ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index b44d8c447afd..3acc7e0b6916 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_atomic_alu_string[16] = {
+ [BPF_ADD >> 4] = "add",
+ [BPF_AND >> 4] = "and",
+ [BPF_OR >> 4] = "or",
+ [BPF_XOR >> 4] = "or",
+};
+
static const char *const bpf_ldst_string[] = {
[BPF_W >> 3] = "u32",
[BPF_H >> 3] = "u16",
@@ -153,14 +160,44 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
+ insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ bpf_alu_string[BPF_OP(insn->imm) >> 4],
+ insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == (BPF_ADD | BPF_FETCH) ||
+ insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG) {
+ verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
insn->code,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
- else
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_XCHG) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+ }
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c1ac7f964bc9..d63912e73ad9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 41ca280b1dc1..308427fe03a3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
- case BPF_FUNC_trace_printk:
- if (!perfmon_capable())
- return NULL;
- return bpf_get_trace_printk_proto();
- case BPF_FUNC_snprintf_btf:
- if (!perfmon_capable())
- return NULL;
- return &bpf_snprintf_btf_proto;
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
switch (func_id) {
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
case BPF_FUNC_probe_read_user:
@@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
return &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_snprintf_btf:
+ return &bpf_snprintf_btf_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dd4b7fd60ee7..1576ff331ee4 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
return inode;
}
@@ -152,7 +152,8 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -381,8 +382,8 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct inode *dir, struct dentry *dentry,
- const char *target)
+static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
struct inode *inode;
@@ -507,7 +508,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, ACC_MODE(flags));
+ ret = path_permission(&path, ACC_MODE(flags));
if (ret)
goto out;
@@ -558,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ);
+ int ret = inode_permission(&init_user_ns, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
index b7ff87939172..5d872a705470 100644
--- a/kernel/bpf/preload/iterators/iterators.c
+++ b/kernel/bpf/preload/iterators/iterators.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
-#include <argp.h>
+#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bfafbf115bf3..be35bfb7fb13 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,9 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/elf.h>
-#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
+#include <linux/buildid.h>
#include "percpu_freelist.h"
#define STACK_CREATE_FLAG_MASK \
@@ -145,140 +144,6 @@ free_smap:
return ERR_PTR(err);
}
-#define BPF_BUILD_ID 3
-/*
- * Parse build id from the note segment. This logic can be shared between
- * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
- * identical.
- */
-static inline int stack_map_parse_build_id(void *page_addr,
- unsigned char *build_id,
- void *note_start,
- Elf32_Word note_size)
-{
- Elf32_Word note_offs = 0, new_offs;
-
- /* check for overflow */
- if (note_start < page_addr || note_start + note_size < note_start)
- return -EINVAL;
-
- /* only supports note that fits in the first page */
- if (note_start + note_size > page_addr + PAGE_SIZE)
- return -EINVAL;
-
- while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
- Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
-
- if (nhdr->n_type == BPF_BUILD_ID &&
- nhdr->n_namesz == sizeof("GNU") &&
- nhdr->n_descsz > 0 &&
- nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
- memcpy(build_id,
- note_start + note_offs +
- ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
- nhdr->n_descsz);
- memset(build_id + nhdr->n_descsz, 0,
- BPF_BUILD_ID_SIZE - nhdr->n_descsz);
- return 0;
- }
- new_offs = note_offs + sizeof(Elf32_Nhdr) +
- ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
- if (new_offs <= note_offs) /* overflow */
- break;
- note_offs = new_offs;
- }
- return -EINVAL;
-}
-
-/* Parse build ID from 32-bit ELF */
-static int stack_map_get_build_id_32(void *page_addr,
- unsigned char *build_id)
-{
- Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
- Elf32_Phdr *phdr;
- int i;
-
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
- return -EINVAL;
-
- phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
-
- for (i = 0; i < ehdr->e_phnum; ++i) {
- if (phdr[i].p_type == PT_NOTE &&
- !stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz))
- return 0;
- }
- return -EINVAL;
-}
-
-/* Parse build ID from 64-bit ELF */
-static int stack_map_get_build_id_64(void *page_addr,
- unsigned char *build_id)
-{
- Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
- Elf64_Phdr *phdr;
- int i;
-
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
- return -EINVAL;
-
- phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
-
- for (i = 0; i < ehdr->e_phnum; ++i) {
- if (phdr[i].p_type == PT_NOTE &&
- !stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz))
- return 0;
- }
- return -EINVAL;
-}
-
-/* Parse build ID of ELF file mapped to vma */
-static int stack_map_get_build_id(struct vm_area_struct *vma,
- unsigned char *build_id)
-{
- Elf32_Ehdr *ehdr;
- struct page *page;
- void *page_addr;
- int ret;
-
- /* only works for page backed storage */
- if (!vma->vm_file)
- return -EINVAL;
-
- page = find_get_page(vma->vm_file->f_mapping, 0);
- if (!page)
- return -EFAULT; /* page not mapped */
-
- ret = -EINVAL;
- page_addr = kmap_atomic(page);
- ehdr = (Elf32_Ehdr *)page_addr;
-
- /* compare magic x7f "ELF" */
- if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
- goto out;
-
- /* only support executable file and shared object file */
- if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
- goto out;
-
- if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
- ret = stack_map_get_build_id_32(page_addr, build_id);
- else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
- ret = stack_map_get_build_id_64(page_addr, build_id);
-out:
- kunmap_atomic(page_addr);
- put_page(page);
- return ret;
-}
-
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
@@ -319,18 +184,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
- memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
vma = find_vma(current->mm, ips[i]);
- if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+ if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
- memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e5999d86c76e..c859bc46d06c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_stats *stats)
{
- u64 nsecs = 0, cnt = 0;
+ u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
for_each_possible_cpu(cpu) {
const struct bpf_prog_stats *st;
unsigned int start;
- u64 tnsecs, tcnt;
+ u64 tnsecs, tcnt, tmisses;
- st = per_cpu_ptr(prog->aux->stats, cpu);
+ st = per_cpu_ptr(prog->stats, cpu);
do {
start = u64_stats_fetch_begin_irq(&st->syncp);
tnsecs = st->nsecs;
tcnt = st->cnt;
+ tmisses = st->misses;
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
+ misses += tmisses;
}
stats->nsecs = nsecs;
stats->cnt = cnt;
+ stats->misses = misses;
}
#ifdef CONFIG_PROC_FS
@@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"memlock:\t%llu\n"
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
- "run_cnt:\t%llu\n",
+ "run_cnt:\t%llu\n"
+ "recursion_misses:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
prog->aux->id,
stats.nsecs,
- stats.cnt);
+ stats.cnt,
+ stats.misses);
}
#endif
@@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
bpf_prog_get_stats(prog, &stats);
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
+ info.recursion_misses = stats.misses;
if (!bpf_capable()) {
info.jited_prog_len = 0;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 175b7b42bfc4..b68cb5d6d6eb 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = {
.show = task_file_seq_show,
};
+struct bpf_iter_seq_task_vma_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ struct vm_area_struct *vma;
+ u32 tid;
+ unsigned long prev_vm_start;
+ unsigned long prev_vm_end;
+};
+
+enum bpf_task_vma_iter_find_op {
+ task_vma_iter_first_vma, /* use mm->mmap */
+ task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_find_vma, /* use find_vma() to find next vma */
+};
+
+static struct vm_area_struct *
+task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
+{
+ struct pid_namespace *ns = info->common.ns;
+ enum bpf_task_vma_iter_find_op op;
+ struct vm_area_struct *curr_vma;
+ struct task_struct *curr_task;
+ u32 curr_tid = info->tid;
+
+ /* If this function returns a non-NULL vma, it holds a reference to
+ * the task_struct, and holds read lock on vma->mm->mmap_lock.
+ * If this function returns NULL, it does not hold any reference or
+ * lock.
+ */
+ if (info->task) {
+ curr_task = info->task;
+ curr_vma = info->vma;
+ /* In case of lock contention, drop mmap_lock to unblock
+ * the writer.
+ *
+ * After relock, call find(mm, prev_vm_end - 1) to find
+ * new vma to process.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * For example, curr_vma == VMA2. Before unlock, we set
+ *
+ * prev_vm_start = 8k
+ * prev_vm_end = 16k
+ *
+ * There are a few cases:
+ *
+ * 1) VMA2 is freed, but VMA3 exists.
+ *
+ * find_vma() will return VMA3, just process VMA3.
+ *
+ * 2) VMA2 still exists.
+ *
+ * find_vma() will return VMA2, process VMA2->next.
+ *
+ * 3) no more vma in this mm.
+ *
+ * Process the next task.
+ *
+ * 4) find_vma() returns a different vma, VMA2'.
+ *
+ * 4.1) If VMA2 covers same range as VMA2', skip VMA2',
+ * because we already covered the range;
+ * 4.2) VMA2 and VMA2' covers different ranges, process
+ * VMA2'.
+ */
+ if (mmap_lock_is_contended(curr_task->mm)) {
+ info->prev_vm_start = curr_vma->vm_start;
+ info->prev_vm_end = curr_vma->vm_end;
+ op = task_vma_iter_find_vma;
+ mmap_read_unlock(curr_task->mm);
+ if (mmap_read_lock_killable(curr_task->mm))
+ goto finish;
+ } else {
+ op = task_vma_iter_next_vma;
+ }
+ } else {
+again:
+ curr_task = task_seq_get_next(ns, &curr_tid, true);
+ if (!curr_task) {
+ info->tid = curr_tid + 1;
+ goto finish;
+ }
+
+ if (curr_tid != info->tid) {
+ info->tid = curr_tid;
+ /* new task, process the first vma */
+ op = task_vma_iter_first_vma;
+ } else {
+ /* Found the same tid, which means the user space
+ * finished data in previous buffer and read more.
+ * We dropped mmap_lock before returning to user
+ * space, so it is necessary to use find_vma() to
+ * find the next vma to process.
+ */
+ op = task_vma_iter_find_vma;
+ }
+
+ if (!curr_task->mm)
+ goto next_task;
+
+ if (mmap_read_lock_killable(curr_task->mm))
+ goto finish;
+ }
+
+ switch (op) {
+ case task_vma_iter_first_vma:
+ curr_vma = curr_task->mm->mmap;
+ break;
+ case task_vma_iter_next_vma:
+ curr_vma = curr_vma->vm_next;
+ break;
+ case task_vma_iter_find_vma:
+ /* We dropped mmap_lock so it is necessary to use find_vma
+ * to find the next vma. This is similar to the mechanism
+ * in show_smaps_rollup().
+ */
+ curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+ /* case 1) and 4.2) above just use curr_vma */
+
+ /* check for case 2) or case 4.1) above */
+ if (curr_vma &&
+ curr_vma->vm_start == info->prev_vm_start &&
+ curr_vma->vm_end == info->prev_vm_end)
+ curr_vma = curr_vma->vm_next;
+ break;
+ }
+ if (!curr_vma) {
+ /* case 3) above, or case 2) 4.1) with vma->next == NULL */
+ mmap_read_unlock(curr_task->mm);
+ goto next_task;
+ }
+ info->task = curr_task;
+ info->vma = curr_vma;
+ return curr_vma;
+
+next_task:
+ put_task_struct(curr_task);
+ info->task = NULL;
+ curr_tid++;
+ goto again;
+
+finish:
+ if (curr_task)
+ put_task_struct(curr_task);
+ info->task = NULL;
+ info->vma = NULL;
+ return NULL;
+}
+
+static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct vm_area_struct *vma;
+
+ vma = task_vma_seq_get_next(info);
+ if (vma && *pos == 0)
+ ++*pos;
+
+ return vma;
+}
+
+static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ ++*pos;
+ return task_vma_seq_get_next(info);
+}
+
+struct bpf_iter__task_vma {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ __bpf_md_ptr(struct vm_area_struct *, vma);
+};
+
+DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
+ struct task_struct *task, struct vm_area_struct *vma)
+
+static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct bpf_iter__task_vma ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.vma = info->vma;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_vma_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_vma_seq_show(seq, false);
+}
+
+static void task_vma_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_vma_seq_show(seq, true);
+ } else {
+ /* info->vma has not been seen by the BPF program. If the
+ * user space reads more, task_vma_seq_get_next should
+ * return this vma again. Set prev_vm_start to ~0UL,
+ * so that we don't skip the vma returned by the next
+ * find_vma() (case task_vma_iter_find_vma in
+ * task_vma_seq_get_next()).
+ */
+ info->prev_vm_start = ~0UL;
+ info->prev_vm_end = info->vma->vm_end;
+ mmap_read_unlock(info->task->mm);
+ put_task_struct(info->task);
+ info->task = NULL;
+ }
+}
+
+static const struct seq_operations task_vma_seq_ops = {
+ .start = task_vma_seq_start,
+ .next = task_vma_seq_next,
+ .stop = task_vma_seq_stop,
+ .show = task_vma_seq_show,
+};
+
BTF_ID_LIST(btf_task_file_ids)
BTF_ID(struct, task_struct)
BTF_ID(struct, file)
+BTF_ID(struct, vm_area_struct)
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
@@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = {
.seq_info = &task_file_seq_info,
};
+static const struct bpf_iter_seq_info task_vma_seq_info = {
+ .seq_ops = &task_vma_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
+};
+
+static struct bpf_iter_reg task_vma_reg_info = {
+ .target = "task_vma",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_vma, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_vma, vma),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &task_vma_seq_info,
+};
+
static int __init task_iter_init(void)
{
int ret;
@@ -339,6 +598,12 @@ static int __init task_iter_init(void)
task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
- return bpf_iter_reg_target(&task_file_reg_info);
+ ret = bpf_iter_reg_target(&task_file_reg_info);
+ if (ret)
+ return ret;
+
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+ return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 35c5887d82ff..7bc3b3209224 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -381,55 +381,100 @@ out:
mutex_unlock(&trampoline_mutex);
}
+#define NO_START_TIME 1
+static u64 notrace bpf_prog_start_time(void)
+{
+ u64 start = NO_START_TIME;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+ start = sched_clock();
+ if (unlikely(!start))
+ start = NO_START_TIME;
+ }
+ return start;
+}
+
+static void notrace inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+
+ stats = this_cpu_ptr(prog->stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->misses++;
+ u64_stats_update_end(&stats->syncp);
+}
+
/* The logic is similar to BPF_PROG_RUN, but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
- * call _bpf_prog_enter
+ * call __bpf_prog_enter
* call prog->bpf_func
* call __bpf_prog_exit
+ *
+ * __bpf_prog_enter returns:
+ * 0 - skip execution of the bpf prog
+ * 1 - execute bpf prog
+ * [2..MAX_U64] - excute bpf prog and record execution time.
+ * This is start time.
*/
-u64 notrace __bpf_prog_enter(void)
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
__acquires(RCU)
{
- u64 start = 0;
-
rcu_read_lock();
migrate_disable();
- if (static_branch_unlikely(&bpf_stats_enabled_key))
- start = sched_clock();
- return start;
+ if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+ inc_misses_counter(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
- __releases(RCU)
+static void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
{
struct bpf_prog_stats *stats;
if (static_branch_unlikely(&bpf_stats_enabled_key) &&
- /* static_key could be enabled in __bpf_prog_enter
- * and disabled in __bpf_prog_exit.
+ /* static_key could be enabled in __bpf_prog_enter*
+ * and disabled in __bpf_prog_exit*.
* And vice versa.
- * Hence check that 'start' is not zero.
+ * Hence check that 'start' is valid.
*/
- start) {
- stats = this_cpu_ptr(prog->aux->stats);
+ start > NO_START_TIME) {
+ stats = this_cpu_ptr(prog->stats);
u64_stats_update_begin(&stats->syncp);
stats->cnt++;
stats->nsecs += sched_clock() - start;
u64_stats_update_end(&stats->syncp);
}
+}
+
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+ __releases(RCU)
+{
+ update_prog_stats(prog, start);
+ __this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
-void notrace __bpf_prog_enter_sleepable(void)
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
{
rcu_read_lock_trace();
+ migrate_disable();
might_fault();
+ if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+ inc_misses_counter(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(void)
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
{
+ update_prog_stats(prog, start);
+ __this_cpu_dec(*(prog->active));
+ migrate_enable();
rcu_read_unlock_trace();
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 37581919e050..c56e3fcb5f1a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -228,6 +228,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static bool bpf_pseudo_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_CALL;
+}
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -498,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -1073,6 +1086,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+{
+ switch (reg->type) {
+ case PTR_TO_MAP_VALUE_OR_NULL: {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
+ reg->type = CONST_PTR_TO_MAP;
+ reg->map_ptr = map->inner_map_meta;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
+ } else {
+ reg->type = PTR_TO_MAP_VALUE;
+ }
+ break;
+ }
+ case PTR_TO_SOCKET_OR_NULL:
+ reg->type = PTR_TO_SOCKET;
+ break;
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ reg->type = PTR_TO_SOCK_COMMON;
+ break;
+ case PTR_TO_TCP_SOCK_OR_NULL:
+ reg->type = PTR_TO_TCP_SOCK;
+ break;
+ case PTR_TO_BTF_ID_OR_NULL:
+ reg->type = PTR_TO_BTF_ID;
+ break;
+ case PTR_TO_MEM_OR_NULL:
+ reg->type = PTR_TO_MEM;
+ break;
+ case PTR_TO_RDONLY_BUF_OR_NULL:
+ reg->type = PTR_TO_RDONLY_BUF;
+ break;
+ case PTR_TO_RDWR_BUF_OR_NULL:
+ reg->type = PTR_TO_RDWR_BUF;
+ break;
+ default:
+ WARN_ONCE(1, "unknown nullable register type");
+ }
+}
+
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
@@ -1486,9 +1544,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
- if (insn[i].code != (BPF_JMP | BPF_CALL))
- continue;
- if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn + i))
continue;
if (!env->bpf_capable) {
verbose(env,
@@ -1654,7 +1710,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (class == BPF_STX) {
- if (reg->type != SCALAR_VALUE)
+ /* BPF_STX (including atomic variants) has multiple source
+ * operands, one of which is a ptr. Check whether the caller is
+ * asking about it.
+ */
+ if (t == SRC_OP && reg->type != SCALAR_VALUE)
return true;
return BPF_SIZE(code) == BPF_DW;
}
@@ -1686,22 +1746,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
return true;
}
-/* Return TRUE if INSN doesn't have explicit value define. */
-static bool insn_no_def(struct bpf_insn *insn)
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
{
- u8 class = BPF_CLASS(insn->code);
-
- return (class == BPF_JMP || class == BPF_JMP32 ||
- class == BPF_STX || class == BPF_ST);
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH)) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else
+ return insn->src_reg;
+ } else {
+ return -1;
+ }
+ default:
+ return insn->dst_reg;
+ }
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- if (insn_no_def(insn))
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+ return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -2271,12 +2347,14 @@ static void save_register_state(struct bpf_func_state *state,
state->stack[spi].slot_type[i] = STACK_SPILL;
}
-/* check_stack_read/write functions track spill/fill of registers,
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_env *env,
- struct bpf_func_state *state, /* func where register points to */
- int off, int size, int value_regno, int insn_idx)
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+ /* stack frame we're writing to */
+ struct bpf_func_state *state,
+ int off, int size, int value_regno,
+ int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -2402,9 +2480,175 @@ static int check_stack_write(struct bpf_verifier_env *env,
return 0;
}
-static int check_stack_read(struct bpf_verifier_env *env,
- struct bpf_func_state *reg_state /* func where register points to */,
- int off, int size, int value_regno)
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+ * known to contain a variable offset.
+ * This function checks whether the write is permitted and conservatively
+ * tracks the effects of the write, considering that each stack slot in the
+ * dynamic range is potentially written to.
+ *
+ * 'off' includes 'regno->off'.
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
+ * the stack.
+ *
+ * Spilled pointers in range are not marked as written because we don't know
+ * what's going to be actually written. This means that read propagation for
+ * future reads cannot be terminated by this write.
+ *
+ * For privileged programs, uninitialized stack slots are considered
+ * initialized by this write (even though we don't know exactly what offsets
+ * are going to be written to). The idea is that we don't want the verifier to
+ * reject future reads that access slots written to through variable offsets.
+ */
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
+ /* func where register points to */
+ struct bpf_func_state *state,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
+{
+ struct bpf_func_state *cur; /* state of the current function */
+ int min_off, max_off;
+ int i, err;
+ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ bool writing_zero = false;
+ /* set if the fact that we're writing a zero is used to let any
+ * stack slots remain STACK_ZERO
+ */
+ bool zero_used = false;
+
+ cur = env->cur_state->frame[env->cur_state->curframe];
+ ptr_reg = &cur->regs[ptr_regno];
+ min_off = ptr_reg->smin_value + off;
+ max_off = ptr_reg->smax_value + off + size;
+ if (value_regno >= 0)
+ value_reg = &cur->regs[value_regno];
+ if (value_reg && register_is_null(value_reg))
+ writing_zero = true;
+
+ err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
+ state->acquired_refs, true);
+ if (err)
+ return err;
+
+
+ /* Variable offset writes destroy any spilled pointers in range. */
+ for (i = min_off; i < max_off; i++) {
+ u8 new_type, *stype;
+ int slot, spi;
+
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+
+ if (!env->allow_ptr_leaks
+ && *stype != NOT_INIT
+ && *stype != SCALAR_VALUE) {
+ /* Reject the write if there's are spilled pointers in
+ * range. If we didn't reject here, the ptr status
+ * would be erased below (even though not all slots are
+ * actually overwritten), possibly opening the door to
+ * leaks.
+ */
+ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+ insn_idx, i);
+ return -EINVAL;
+ }
+
+ /* Erase all spilled pointers. */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+
+ /* Update the slot type. */
+ new_type = STACK_MISC;
+ if (writing_zero && *stype == STACK_ZERO) {
+ new_type = STACK_ZERO;
+ zero_used = true;
+ }
+ /* If the slot is STACK_INVALID, we check whether it's OK to
+ * pretend that it will be initialized by this write. The slot
+ * might not actually be written to, and so if we mark it as
+ * initialized future reads might leak uninitialized memory.
+ * For privileged programs, we will accept such reads to slots
+ * that may or may not be written because, if we're reject
+ * them, the error would be too confusing.
+ */
+ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+ insn_idx, i);
+ return -EINVAL;
+ }
+ *stype = new_type;
+ }
+ if (zero_used) {
+ /* backtracking doesn't work for STACK_ZERO yet. */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/* When register 'dst_regno' is assigned some values from stack[min_off,
+ * max_off), we set the register's type according to the types of the
+ * respective stack slots. If all the stack values are known to be zeros, then
+ * so is the destination reg. Otherwise, the register is considered to be
+ * SCALAR. This function does not deal with register filling; the caller must
+ * ensure that all spilled registers in the stack range have been marked as
+ * read.
+ */
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *ptr_state,
+ int min_off, int max_off, int dst_regno)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ int i, slot, spi;
+ u8 *stype;
+ int zeros = 0;
+
+ for (i = min_off; i < max_off; i++) {
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ stype = ptr_state->stack[spi].slot_type;
+ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+ break;
+ zeros++;
+ }
+ if (zeros == max_off - min_off) {
+ /* any access_size read into register is zero extended,
+ * so the whole register == const_zero
+ */
+ __mark_reg_const_zero(&state->regs[dst_regno]);
+ /* backtracking doesn't support STACK_ZERO yet,
+ * so mark it precise here, so that later
+ * backtracking can stop here.
+ * Backtracking may not need this if this register
+ * doesn't participate in pointer adjustment.
+ * Forward propagation of precise flag is not
+ * necessary either. This mark is only to stop
+ * backtracking. Any register that contributed
+ * to const 0 was marked precise before spill.
+ */
+ state->regs[dst_regno].precise = true;
+ } else {
+ /* have read misc data from the stack */
+ mark_reg_unknown(env, state->regs, dst_regno);
+ }
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+}
+
+/* Read the stack at 'off' and put the results into the register indicated by
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+ * spilled reg.
+ *
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
+ * register.
+ *
+ * The access is assumed to be within the current stack bounds.
+ */
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *reg_state,
+ int off, int size, int dst_regno)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -2412,11 +2656,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
u8 *stype;
- if (reg_state->allocated_stack <= slot) {
- verbose(env, "invalid read from stack off %d+0 size %d\n",
- off, size);
- return -EACCES;
- }
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
@@ -2427,9 +2666,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
verbose(env, "invalid size of register fill\n");
return -EACCES;
}
- if (value_regno >= 0) {
- mark_reg_unknown(env, state->regs, value_regno);
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+ if (dst_regno >= 0) {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
return 0;
@@ -2441,16 +2680,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
}
- if (value_regno >= 0) {
+ if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = *reg;
+ state->regs[dst_regno] = *reg;
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
- /* If value_regno==-1, the caller is asking us whether
+ /* If dst_regno==-1, the caller is asking us whether
* it is acceptable to use this value as a SCALAR_VALUE
* (e.g. for XADD).
* We must not allow unprivileged callers to do that
@@ -2462,70 +2701,167 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
- int zeros = 0;
+ u8 type;
for (i = 0; i < size; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_MISC)
continue;
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
- zeros++;
+ if (type == STACK_ZERO)
continue;
- }
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- if (value_regno >= 0) {
- if (zeros == size) {
- /* any size read into register is zero extended,
- * so the whole register == const_zero
- */
- __mark_reg_const_zero(&state->regs[value_regno]);
- /* backtracking doesn't support STACK_ZERO yet,
- * so mark it precise here, so that later
- * backtracking can stop here.
- * Backtracking may not need this if this register
- * doesn't participate in pointer adjustment.
- * Forward propagation of precise flag is not
- * necessary either. This mark is only to stop
- * backtracking. Any register that contributed
- * to const 0 was marked precise before spill.
- */
- state->regs[value_regno].precise = true;
- } else {
- /* have read misc data from the stack */
- mark_reg_unknown(env, state->regs, value_regno);
- }
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
- }
+ if (dst_regno >= 0)
+ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
}
return 0;
}
-static int check_stack_access(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg,
- int off, int size)
+enum stack_access_src {
+ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
+ ACCESS_HELPER = 2, /* the access is performed by a helper */
+};
+
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ bool zero_size_allowed,
+ enum stack_access_src type,
+ struct bpf_call_arg_meta *meta);
+
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+{
+ return cur_regs(env) + regno;
+}
+
+/* Read the stack at 'ptr_regno + off' and put the result into the register
+ * 'dst_regno'.
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * but not its variable offset.
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+ *
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+ * filling registers (i.e. reads of spilled register cannot be detected when
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
+ * instead.
+ */
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size, int dst_regno)
{
- /* Stack accesses must be at a fixed offset, so that we
- * can determine what type of data were returned. See
- * check_stack_read().
+ /* The state of the source register. */
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *ptr_state = func(env, reg);
+ int err;
+ int min_off, max_off;
+
+ /* Note that we pass a NULL meta, so raw access will not be permitted.
*/
- if (!tnum_is_const(reg->var_off)) {
+ err = check_stack_range_initialized(env, ptr_regno, off, size,
+ false, ACCESS_DIRECT, NULL);
+ if (err)
+ return err;
+
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
+ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ return 0;
+}
+
+/* check_stack_read dispatches to check_stack_read_fixed_off or
+ * check_stack_read_var_off.
+ *
+ * The caller must ensure that the offset falls within the allocated stack
+ * bounds.
+ *
+ * 'dst_regno' is a register which will receive the value from the stack. It
+ * can be -1, meaning that the read value is not going to a register.
+ */
+static int check_stack_read(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int dst_regno)
+{
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+ /* Some accesses are only permitted with a static offset. */
+ bool var_off = !tnum_is_const(reg->var_off);
+
+ /* The offset is required to be static when reads don't go to a
+ * register, in order to not leak pointers (see
+ * check_stack_read_fixed_off).
+ */
+ if (dst_regno < 0 && var_off) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
tn_buf, off, size);
return -EACCES;
}
+ /* Variable offset is prohibited for unprivileged mode for simplicity
+ * since it requires corresponding support in Spectre masking for stack
+ * ALU. See also retrieve_ptr_limit().
+ */
+ if (!env->bypass_spec_v1 && var_off) {
+ char tn_buf[48];
- if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose(env, "invalid stack off=%d size=%d\n", off, size);
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ ptr_regno, tn_buf);
return -EACCES;
}
- return 0;
+ if (!var_off) {
+ off += reg->var_off.value;
+ err = check_stack_read_fixed_off(env, state, off, size,
+ dst_regno);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones. Note that dst_regno >= 0 on this
+ * branch.
+ */
+ err = check_stack_read_var_off(env, ptr_regno, off, size,
+ dst_regno);
+ }
+ return err;
+}
+
+
+/* check_stack_write dispatches to check_stack_write_fixed_off or
+ * check_stack_write_var_off.
+ *
+ * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+ * 'value_regno' is the register whose value we're writing to the stack. It can
+ * be -1, meaning that we're not writing from a register.
+ *
+ * The caller must ensure that the offset falls within the maximum stack size.
+ */
+static int check_stack_write(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
+{
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+
+ if (tnum_is_const(reg->var_off)) {
+ off += reg->var_off.value;
+ err = check_stack_write_fixed_off(env, state, off, size,
+ value_regno, insn_idx);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones.
+ */
+ err = check_stack_write_var_off(env, state,
+ ptr_regno, off, size,
+ value_regno, insn_idx);
+ }
+ return err;
}
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@ -2858,11 +3194,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
return -EACCES;
}
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
-{
- return cur_regs(env) + regno;
-}
-
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@ -2981,8 +3312,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_STACK:
pointer_desc = "stack ";
- /* The stack spill tracking logic in check_stack_write()
- * and check_stack_read() relies on stack accesses being
+ /* The stack spill tracking logic in check_stack_write_fixed_off()
+ * and check_stack_read_fixed_off() relies on stack accesses being
* aligned.
*/
strict = true;
@@ -3074,9 +3405,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (insn[i].code != (BPF_JMP | BPF_CALL))
- continue;
- if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
@@ -3400,6 +3729,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return 0;
}
+/* Check that the stack access at the given offset is within bounds. The
+ * maximum valid offset is -1.
+ *
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
+ * -state->allocated_stack for reads.
+ */
+static int check_stack_slot_within_bounds(int off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
+{
+ int min_valid_off;
+
+ if (t == BPF_WRITE)
+ min_valid_off = -MAX_BPF_STACK;
+ else
+ min_valid_off = -state->allocated_stack;
+
+ if (off < min_valid_off || off > -1)
+ return -EACCES;
+ return 0;
+}
+
+/* Check that the stack access at 'regno + off' falls within the maximum stack
+ * bounds.
+ *
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
+ */
+static int check_stack_access_within_bounds(
+ struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ enum stack_access_src src, enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_func_state *state = func(env, reg);
+ int min_off, max_off;
+ int err;
+ char *err_extra;
+
+ if (src == ACCESS_HELPER)
+ /* We don't know if helpers are reading or writing (or both). */
+ err_extra = " indirect access to";
+ else if (type == BPF_READ)
+ err_extra = " read from";
+ else
+ err_extra = " write to";
+
+ if (tnum_is_const(reg->var_off)) {
+ min_off = reg->var_off.value + off;
+ if (access_size > 0)
+ max_off = min_off + access_size - 1;
+ else
+ max_off = min_off;
+ } else {
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+ reg->smin_value <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+ err_extra, regno);
+ return -EACCES;
+ }
+ min_off = reg->smin_value + off;
+ if (access_size > 0)
+ max_off = reg->smax_value + off + access_size - 1;
+ else
+ max_off = min_off;
+ }
+
+ err = check_stack_slot_within_bounds(min_off, state, type);
+ if (!err)
+ err = check_stack_slot_within_bounds(max_off, state, type);
+
+ if (err) {
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+ err_extra, regno, off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
+ err_extra, regno, tn_buf, access_size);
+ }
+ }
+ return err;
+}
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
@@ -3515,8 +3929,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
} else if (reg->type == PTR_TO_STACK) {
- off += reg->var_off.value;
- err = check_stack_access(env, reg, off, size);
+ /* Basic bounds checks. */
+ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
if (err)
return err;
@@ -3525,12 +3939,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
- if (t == BPF_WRITE)
- err = check_stack_write(env, state, off, size,
- value_regno, insn_idx);
- else
- err = check_stack_read(env, state, off, size,
+ if (t == BPF_READ)
+ err = check_stack_read(env, regno, off, size,
value_regno);
+ else
+ err = check_stack_write(env, regno, off, size,
+ value_regno, insn_idx);
} else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose(env, "cannot write into packet\n");
@@ -3606,13 +4020,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
+ int load_reg;
int err;
- if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
- insn->imm != 0) {
- verbose(env, "BPF_XADD uses reserved fields\n");
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ break;
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
+ return -EINVAL;
+ }
+
+ if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
+ verbose(env, "invalid atomic operand size\n");
return -EINVAL;
}
@@ -3626,6 +4057,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (err)
return err;
+ if (insn->imm == BPF_CMPXCHG) {
+ /* Check comparison of R0 with memory location */
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+ }
+
if (is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
@@ -3635,66 +4073,91 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
is_pkt_reg(env, insn->dst_reg) ||
is_flow_key_reg(env, insn->dst_reg) ||
is_sk_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
return -EACCES;
}
- /* check whether atomic_add can read the memory */
+ if (insn->imm & BPF_FETCH) {
+ if (insn->imm == BPF_CMPXCHG)
+ load_reg = BPF_REG_0;
+ else
+ load_reg = insn->src_reg;
+
+ /* check and record load of old value */
+ err = check_reg_arg(env, load_reg, DST_OP);
+ if (err)
+ return err;
+ } else {
+ /* This instruction accesses a memory location but doesn't
+ * actually load it into a register.
+ */
+ load_reg = -1;
+ }
+
+ /* check whether we can read the memory */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1, true);
+ BPF_SIZE(insn->code), BPF_READ, load_reg, true);
if (err)
return err;
- /* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1, true);
-}
-
-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
- int off, int access_size,
- bool zero_size_allowed)
-{
- struct bpf_reg_state *reg = reg_state(env, regno);
-
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
- if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
- regno, off, access_size);
- } else {
- char tn_buf[48];
+ /* check whether we can write into the same memory */
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ if (err)
+ return err;
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
- regno, tn_buf, access_size);
- }
- return -EACCES;
- }
return 0;
}
-/* when register 'regno' is passed into function that will read 'access_size'
- * bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized.
- * Unlike most pointer bounds-checking functions, this one doesn't take an
- * 'off' argument, so it has to add in reg->off itself.
+/* When register 'regno' is used to read the stack (either directly or through
+ * a helper function) make sure that it's within stack boundary and, depending
+ * on the access type, that all elements of the stack are initialized.
+ *
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
+ *
+ * All registers that have been spilled on the stack in the slots within the
+ * read offsets are marked as read.
*/
-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
- struct bpf_call_arg_meta *meta)
+static int check_stack_range_initialized(
+ struct bpf_verifier_env *env, int regno, int off,
+ int access_size, bool zero_size_allowed,
+ enum stack_access_src type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
+ char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
+ enum bpf_access_type bounds_check_type;
+ /* Some accesses can write anything into the stack, others are
+ * read-only.
+ */
+ bool clobber = false;
+
+ if (access_size == 0 && !zero_size_allowed) {
+ verbose(env, "invalid zero-sized read\n");
+ return -EACCES;
+ }
+
+ if (type == ACCESS_HELPER) {
+ /* The bounds checks for writes are more permissive than for
+ * reads. However, if raw_mode is not set, we'll do extra
+ * checks below.
+ */
+ bounds_check_type = BPF_WRITE;
+ clobber = true;
+ } else {
+ bounds_check_type = BPF_READ;
+ }
+ err = check_stack_access_within_bounds(env, regno, off, access_size,
+ type, bounds_check_type);
+ if (err)
+ return err;
+
if (tnum_is_const(reg->var_off)) {
- min_off = max_off = reg->var_off.value + reg->off;
- err = __check_stack_boundary(env, regno, min_off, access_size,
- zero_size_allowed);
- if (err)
- return err;
+ min_off = max_off = reg->var_off.value + off;
} else {
/* Variable offset is prohibited for unprivileged mode for
* simplicity since it requires corresponding support in
@@ -3705,8 +4168,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
- regno, tn_buf);
+ verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, err_extra, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -3718,28 +4181,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
if (meta && meta->raw_mode)
meta = NULL;
- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
- reg->smax_value <= -BPF_MAX_VAR_OFF) {
- verbose(env, "R%d unbounded indirect variable offset stack access\n",
- regno);
- return -EACCES;
- }
- min_off = reg->smin_value + reg->off;
- max_off = reg->smax_value + reg->off;
- err = __check_stack_boundary(env, regno, min_off, access_size,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d min value is outside of stack bound\n",
- regno);
- return err;
- }
- err = __check_stack_boundary(env, regno, max_off, access_size,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d max value is outside of stack bound\n",
- regno);
- return err;
- }
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
}
if (meta && meta->raw_mode) {
@@ -3759,8 +4202,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
if (*stype == STACK_MISC)
goto mark;
if (*stype == STACK_ZERO) {
- /* helper can write anything into the stack */
- *stype = STACK_MISC;
+ if (clobber) {
+ /* helper can write anything into the stack */
+ *stype = STACK_MISC;
+ }
goto mark;
}
@@ -3771,22 +4216,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
- __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
- for (j = 0; j < BPF_REG_SIZE; j++)
- state->stack[spi].slot_type[j] = STACK_MISC;
+ if (clobber) {
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ state->stack[spi].slot_type[j] = STACK_MISC;
+ }
goto mark;
}
err:
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
- min_off, i - min_off, access_size);
+ verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
+ err_extra, regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
- tn_buf, i - min_off, access_size);
+ verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
+ err_extra, regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -3835,8 +4282,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
"rdwr",
&env->prog->aux->max_rdwr_access);
case PTR_TO_STACK:
- return check_stack_boundary(env, regno, access_size,
- zero_size_allowed, meta);
+ return check_stack_range_initialized(
+ env,
+ regno, reg->off, access_size,
+ zero_size_allowed, ACCESS_HELPER, meta);
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
@@ -3850,6 +4299,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno, u32 mem_size)
+{
+ if (register_is_null(reg))
+ return 0;
+
+ if (reg_type_may_be_null(reg->type)) {
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ const struct bpf_reg_state saved_reg = *reg;
+ int rv;
+
+ mark_ptr_not_null_reg(reg);
+ rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+ *reg = saved_reg;
+ return rv;
+ }
+
+ return check_helper_mem_access(env, regno, mem_size, true, NULL);
+}
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
* Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -4321,7 +4793,7 @@ skip_type_check:
err = mark_chain_precision(env, regno);
} else if (arg_type_is_alloc_size(arg_type)) {
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
@@ -4834,8 +5306,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
subprog);
clear_caller_saved_regs(env, caller->regs);
- /* All global functions return SCALAR_VALUE */
+ /* All global functions return a 64-bit SCALAR_VALUE */
mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* continue with next insn after call */
return 0;
@@ -5500,6 +5973,41 @@ do_sim:
return !ret ? -EFAULT : 0;
}
+/* check that stack access falls within stack limits and that 'reg' doesn't
+ * have a variable offset.
+ *
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
+ * requires corresponding support in Spectre masking for stack ALU. See also
+ * retrieve_ptr_limit().
+ *
+ *
+ * 'off' includes 'reg->off'.
+ */
+static int check_stack_access_for_ptr_arithmetic(
+ struct bpf_verifier_env *env,
+ int regno,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+ regno, tn_buf, off);
+ return -EACCES;
+ }
+
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
+ "prohibited for !root; off=%d\n", regno, off);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -5743,10 +6251,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
"prohibited for !root\n", dst);
return -EACCES;
} else if (dst_reg->type == PTR_TO_STACK &&
- check_stack_access(env, dst_reg, dst_reg->off +
- dst_reg->var_off.value, 1)) {
- verbose(env, "R%d stack pointer arithmetic goes out of range, "
- "prohibited for !root\n", dst);
+ check_stack_access_for_ptr_arithmetic(
+ env, dst, dst_reg, dst_reg->off +
+ dst_reg->var_off.value)) {
return -EACCES;
}
}
@@ -6225,7 +6732,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
* 3) the signed bounds cross zero, so they tell us nothing
* about the result
* If the value in dst_reg is known nonnegative, then again the
- * unsigned bounts capture the signed bounds.
+ * unsigned bounds capture the signed bounds.
* Thus, in all cases it suffices to blow away our signed bounds
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
@@ -6256,7 +6763,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
* 3) the signed bounds cross zero, so they tell us nothing
* about the result
* If the value in dst_reg is known nonnegative, then again the
- * unsigned bounts capture the signed bounds.
+ * unsigned bounds capture the signed bounds.
* Thus, in all cases it suffices to blow away our signed bounds
* and rely on inferring new ones from the unsigned bounds and
* var_off of the result.
@@ -7326,43 +7833,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
if (is_null) {
reg->type = SCALAR_VALUE;
- } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- const struct bpf_map *map = reg->map_ptr;
-
- if (map->inner_map_meta) {
- reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = map->inner_map_meta;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- reg->type = PTR_TO_XDP_SOCK;
- } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH) {
- reg->type = PTR_TO_SOCKET;
- } else {
- reg->type = PTR_TO_MAP_VALUE;
- }
- } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
- reg->type = PTR_TO_SOCKET;
- } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
- reg->type = PTR_TO_SOCK_COMMON;
- } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
- reg->type = PTR_TO_TCP_SOCK;
- } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
- reg->type = PTR_TO_BTF_ID;
- } else if (reg->type == PTR_TO_MEM_OR_NULL) {
- reg->type = PTR_TO_MEM;
- } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
- reg->type = PTR_TO_RDONLY_BUF;
- } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
- reg->type = PTR_TO_RDWR_BUF;
- }
- if (is_null) {
/* We don't need id and ref_obj_id from this point
* onwards anymore, thus we should better reset it,
* so that state pruning has chances to take effect.
*/
reg->id = 0;
reg->ref_obj_id = 0;
- } else if (!reg_may_point_to_spin_lock(reg)) {
+
+ return;
+ }
+
+ mark_ptr_not_null_reg(reg);
+
+ if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
* in release_reg_references().
*
@@ -7945,6 +8428,9 @@ static int check_return_code(struct bpf_verifier_env *env)
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+ range = tnum_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
@@ -9530,14 +10016,19 @@ static int do_check(struct bpf_verifier_env *env)
} else if (class == BPF_STX) {
enum bpf_reg_type *prev_dst_type, dst_reg_type;
- if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, env->insn_idx, insn);
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ err = check_atomic(env, env->insn_idx, insn);
if (err)
return err;
env->insn_idx++;
continue;
}
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -9709,6 +10200,36 @@ process_bpf_exit:
return 0;
}
+static int find_btf_percpu_datasec(struct btf *btf)
+{
+ const struct btf_type *t;
+ const char *tname;
+ int i, n;
+
+ /*
+ * Both vmlinux and module each have their own ".data..percpu"
+ * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF
+ * types to look at only module's own BTF types.
+ */
+ n = btf_nr_types(btf);
+ if (btf_is_module(btf))
+ i = btf_nr_types(btf_vmlinux);
+ else
+ i = 1;
+
+ for(; i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, ".data..percpu"))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
/* replace pseudo btf_id with kernel symbol address */
static int check_pseudo_btf_id(struct bpf_verifier_env *env,
struct bpf_insn *insn,
@@ -9716,48 +10237,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
{
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
+ struct btf_mod_pair *btf_mod;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u32 type, id = insn->imm;
+ struct btf *btf;
s32 datasec_id;
u64 addr;
- int i;
+ int i, btf_fd, err;
- if (!btf_vmlinux) {
- verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
- return -EINVAL;
- }
-
- if (insn[1].imm != 0) {
- verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
- return -EINVAL;
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
+ btf_get(btf);
}
- t = btf_type_by_id(btf_vmlinux, id);
+ t = btf_type_by_id(btf, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
- return -ENOENT;
+ err = -ENOENT;
+ goto err_put;
}
if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
- id);
- return -EINVAL;
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+ err = -EINVAL;
+ goto err_put;
}
- sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
+ sym_name = btf_name_by_offset(btf, t->name_off);
addr = kallsyms_lookup_name(sym_name);
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
- return -ENOENT;
+ err = -ENOENT;
+ goto err_put;
}
- datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
- BTF_KIND_DATASEC);
+ datasec_id = find_btf_percpu_datasec(btf);
if (datasec_id > 0) {
- datasec = btf_type_by_id(btf_vmlinux, datasec_id);
+ datasec = btf_type_by_id(btf, datasec_id);
for_each_vsi(i, datasec, vsi) {
if (vsi->type == id) {
percpu = true;
@@ -9770,10 +10300,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
insn[1].imm = addr >> 32;
type = t->type;
- t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
+ t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
- aux->btf_var.btf = btf_vmlinux;
+ aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
const struct btf_type *ret;
@@ -9781,21 +10311,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
u32 tsize;
/* resolve the type size of ksym. */
- ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+ ret = btf_resolve_size(btf, t, &tsize);
if (IS_ERR(ret)) {
- tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ tname = btf_name_by_offset(btf, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
- return -EINVAL;
+ err = -EINVAL;
+ goto err_put;
}
aux->btf_var.reg_type = PTR_TO_MEM;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
- aux->btf_var.btf = btf_vmlinux;
+ aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++) {
+ if (env->used_btfs[i].btf == btf) {
+ btf_put(btf);
+ return 0;
+ }
+ }
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ err = -E2BIG;
+ goto err_put;
+ }
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ err = -ENXIO;
+ goto err_put;
+ }
+ }
+
+ env->used_btf_cnt++;
+
return 0;
+err_put:
+ btf_put(btf);
+ return err;
}
static int check_map_prealloc(struct bpf_map *map)
@@ -9897,15 +10460,22 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_LRU_HASH:
case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
if (!is_preallocated_map(map)) {
verbose(env,
- "Sleepable programs can only use preallocated hash maps\n");
+ "Sleepable programs can only use preallocated maps\n");
return -EINVAL;
}
break;
+ case BPF_MAP_TYPE_RINGBUF:
+ break;
default:
verbose(env,
- "Sleepable programs can only use array and hash maps\n");
+ "Sleepable programs can only use array, hash, and ringbuf maps\n");
return -EINVAL;
}
@@ -9942,13 +10512,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (BPF_CLASS(insn->code) == BPF_STX &&
- ((BPF_MODE(insn->code) != BPF_MEM &&
- BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
-
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
@@ -10092,6 +10655,13 @@ static void release_maps(struct bpf_verifier_env *env)
env->used_map_cnt);
}
+/* drop refcnt of maps used by the rejected program */
+static void release_btfs(struct bpf_verifier_env *env)
+{
+ __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
+ env->used_btf_cnt);
+}
+
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
@@ -10463,8 +11033,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
for (i = 0; i < len; i++) {
int adj_idx = i + delta;
struct bpf_insn insn;
+ int load_reg;
insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
if (!aux[adj_idx].zext_dst) {
u8 code, class;
u32 imm_rnd;
@@ -10474,14 +11046,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
code = insn.code;
class = BPF_CLASS(code);
- if (insn_no_def(&insn))
+ if (load_reg == -1)
continue;
/* NOTE: arg "reg" (the fourth one) is only used for
- * BPF_STX which has been ruled out in above
- * check, it is safe to pass NULL here.
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
*/
- if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -10496,18 +11068,33 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
imm_rnd = get_random_int();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
- rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ rnd_hi32_patch[3].dst_reg = load_reg;
patch = rnd_hi32_patch;
patch_len = 4;
goto apply_patch_buffer;
}
- if (!bpf_jit_needs_zext())
+ /* Add in an zero-extend instruction if a) the JIT has requested
+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
+ if (WARN_ON(load_reg == -1)) {
+ verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ return -EFAULT;
+ }
+
zext_patch[0] = insn;
- zext_patch[1].dst_reg = insn.dst_reg;
- zext_patch[1].src_reg = insn.dst_reg;
+ zext_patch[1].dst_reg = load_reg;
+ zext_patch[1].src_reg = load_reg;
patch = zext_patch;
patch_len = 2;
apply_patch_buffer:
@@ -10723,8 +11310,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
@@ -10765,7 +11351,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* BPF_PROG_RUN doesn't call subprogs directly,
* hence main prog stats include the runtime of subprogs.
* subprogs don't have IDs and not reachable via prog_get_next_id
- * func[i]->aux->stats will never be accessed and stays NULL
+ * func[i]->stats will never be accessed and stays NULL
*/
func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
@@ -10853,8 +11439,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
@@ -10899,8 +11484,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
subprog = find_subprog(env, i + insn->off + 1);
@@ -10929,8 +11513,7 @@ out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
@@ -10965,8 +11548,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return -EINVAL;
}
for (i = 0; i < prog->len; i++, insn++) {
- if (insn->code != (BPF_JMP | BPF_CALL) ||
- insn->src_reg != BPF_PSEUDO_CALL)
+ if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);
if (depth < 0)
@@ -11006,7 +11588,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
struct bpf_insn *patchlet;
struct bpf_insn chk_and_div[] = {
- /* Rx div 0 -> 0 */
+ /* [R,W]x div 0 -> 0 */
BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
BPF_JNE | BPF_K, insn->src_reg,
0, 2, 0),
@@ -11015,16 +11597,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*insn,
};
struct bpf_insn chk_and_mod[] = {
- /* Rx mod 0 -> Rx */
+ /* [R,W]x mod 0 -> [R,W]x */
BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
BPF_JEQ | BPF_K, insn->src_reg,
- 0, 1, 0),
+ 0, 1 + (is64 ? 0 : 1), 0),
*insn,
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
};
patchlet = isdiv ? chk_and_div : chk_and_mod;
cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod);
+ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
@@ -11429,6 +12013,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i);
+ else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+ const u32 mem_size = regs[i].mem_size;
+
+ mark_reg_known_zero(env, regs, i);
+ regs[i].mem_size = mem_size;
+ regs[i].id = ++env->id_gen;
+ }
}
} else {
/* 1st arg to a function */
@@ -12007,6 +12598,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->strict_alignment = false;
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->allow_uninit_stack = bpf_allow_uninit_stack();
env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
@@ -12102,7 +12694,10 @@ skip_full_check:
goto err_release_maps;
}
- if (ret == 0 && env->used_map_cnt) {
+ if (ret)
+ goto err_release_maps;
+
+ if (env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
@@ -12116,15 +12711,29 @@ skip_full_check:
memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;
+ }
+ if (env->used_btf_cnt) {
+ /* if program passed verifier, update used_btfs in bpf_prog_aux */
+ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
+ sizeof(env->used_btfs[0]),
+ GFP_KERNEL);
+ if (!env->prog->aux->used_btfs) {
+ ret = -ENOMEM;
+ goto err_release_maps;
+ }
+ memcpy(env->prog->aux->used_btfs, env->used_btfs,
+ sizeof(env->used_btfs[0]) * env->used_btf_cnt);
+ env->prog->aux->used_btf_cnt = env->used_btf_cnt;
+ }
+ if (env->used_map_cnt || env->used_btf_cnt) {
/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}
- if (ret == 0)
- adjust_btf_func(env);
+ adjust_btf_func(env);
err_release_maps:
if (!env->prog->aux->used_maps)
@@ -12132,6 +12741,8 @@ err_release_maps:
* them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
+ if (!env->prog->aux->used_btfs)
+ release_btfs(env);
/* extension progs temporarily inherit the attach_type of their targets
for verification purposes, so set it back to zero before returning
diff --git a/kernel/capability.c b/kernel/capability.c
index de7eac903a2a..46a361dde042 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -484,10 +484,12 @@ EXPORT_SYMBOL(file_ns_capable);
*
* Return true if the inode uid and gid are within the namespace.
*/
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+ struct user_namespace *mnt_userns,
+ const struct inode *inode)
{
- return kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
+ kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
}
/**
@@ -499,11 +501,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+ const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+ return ns_capable(ns, cap) &&
+ privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1ea995f801ec..9153b20e5cc6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4672,7 +4672,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
iput(inode);
return ret;
}
@@ -4728,8 +4728,8 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
return ret;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
{
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
@@ -4740,7 +4740,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4750,19 +4750,26 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
+ /* process and thread migrations follow same delegation rule */
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, true);
+ of->file->f_path.dentry->d_sb, threadgroup);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@ -4773,41 +4780,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, false);
- if (ret)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 53c70c470a38..5258b68153e0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -98,7 +98,7 @@ struct cpuset {
* and if it ends up empty, it will inherit the parent's mask.
*
*
- * On legacy hierachy:
+ * On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
@@ -1309,10 +1309,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
*
- * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
+ * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4e11e91010e1..1b6302ecbabe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+#ifdef CONFIG_LOCKDEP
+int lockdep_is_cpus_held(void)
+{
+ return percpu_rwsem_is_held(&cpu_hotplug_lock);
+}
+#endif
+
static void lockdep_acquire_cpus_lock(void)
{
rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index af6e8b4fb359..4708aec492df 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -119,7 +119,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
*/
static atomic_t masters_in_kgdb;
static atomic_t slaves_in_kgdb;
-static atomic_t kgdb_break_tasklet_var;
atomic_t kgdb_setting_breakpoint;
struct task_struct *kgdb_usethread;
@@ -456,6 +455,17 @@ setundefined:
return 0;
}
+void kgdb_free_init_mem(void)
+{
+ int i;
+
+ /* Clear init memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+}
+
#ifdef CONFIG_KGDB_KDB
void kdb_dump_stack_on_cpu(int cpu)
{
@@ -1084,31 +1094,6 @@ static void kgdb_unregister_callbacks(void)
}
}
-/*
- * There are times a tasklet needs to be used vs a compiled in
- * break point so as to cause an exception outside a kgdb I/O module,
- * such as is the case with kgdboe, where calling a breakpoint in the
- * I/O driver itself would be fatal.
- */
-static void kgdb_tasklet_bpt(unsigned long ing)
-{
- kgdb_breakpoint();
- atomic_set(&kgdb_break_tasklet_var, 0);
-}
-
-static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
-
-void kgdb_schedule_breakpoint(void)
-{
- if (atomic_read(&kgdb_break_tasklet_var) ||
- atomic_read(&kgdb_active) != -1 ||
- atomic_read(&kgdb_setting_breakpoint))
- return;
- atomic_inc(&kgdb_break_tasklet_var);
- tasklet_schedule(&kgdb_tasklet_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -1166,7 +1151,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
EXPORT_SYMBOL_GPL(kgdb_register_io_module);
/**
- * kkgdb_unregister_io_module - unregister KGDB IO module
+ * kgdb_unregister_io_module - unregister KGDB IO module
* @old_dbg_io_ops: the io ops vector
*
* Unregister it with the KGDB core.
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a77df59d9ca5..e149a0ac9e9e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -595,7 +595,7 @@ static char *gdb_hex_reg_helper(int regnum, char *out)
dbg_reg_def[i].size);
}
-/* Handle the 'p' individual regster get */
+/* Handle the 'p' individual register get */
static void gdb_cmd_reg_get(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -610,7 +610,7 @@ static void gdb_cmd_reg_get(struct kgdb_state *ks)
gdb_hex_reg_helper(regnum, remcom_out_buffer);
}
-/* Handle the 'P' individual regster set */
+/* Handle the 'P' individual register set */
static void gdb_cmd_reg_set(struct kgdb_state *ks)
{
unsigned long regnum;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index a4281fb99299..6cb92f7bbbd0 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -230,7 +230,7 @@ extern struct task_struct *kdb_curr_task(int);
#define kdb_task_has_cpu(p) (task_curr(p))
-#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
@@ -254,4 +254,14 @@ extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
#endif /* CONFIG_KGDB_KDB */
+
+#define kdb_func_printf(format, args...) \
+ kdb_printf("%s: " format, __func__, ## args)
+
+#define kdb_dbg_printf(mask, format, args...) \
+ do { \
+ if (KDB_DEBUG(mask)) \
+ kdb_func_printf(format, ## args); \
+ } while (0)
+
#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6226502ce049..f7c1885abeb6 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -39,20 +39,15 @@
*/
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname,
- symtab);
+ kdb_dbg_printf(AR, "symname=%s, symtab=%px\n", symname, symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
if (symtab->sym_start) {
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 1, "
- "symtab->sym_start=0x%lx\n",
- symtab->sym_start);
+ kdb_dbg_printf(AR, "returns 1, symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
return 1;
}
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 0\n");
+ kdb_dbg_printf(AR, "returns 0\n");
return 0;
}
EXPORT_SYMBOL(kdbgetsymval);
@@ -87,16 +82,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
#define knt1_size 128 /* must be >= kallsyms table size */
char *knt1 = NULL;
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab);
+ kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
if (!knt1) {
- kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
- addr);
+ kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr);
goto out;
}
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
@@ -147,11 +140,8 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret,
- symtab->sym_start, symtab->mod_name, symtab->sym_name,
- symtab->sym_name);
+ kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
+ ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
out:
debug_kfree(knt1);
@@ -328,7 +318,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -353,7 +343,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size)
int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -435,7 +425,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -484,7 +474,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -528,7 +518,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -602,8 +592,7 @@ unsigned long kdb_task_state_string(const char *s)
res = ~0UL;
break;
default:
- kdb_printf("%s: unknown flag '%c' ignored\n",
- __func__, *s);
+ kdb_func_printf("unknown flag '%c' ignored\n", *s);
break;
}
++s;
@@ -884,18 +873,16 @@ void debug_kusage(void)
if (!debug_kusage_one_time)
goto out;
debug_kusage_one_time = 0;
- kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
- __func__, dah_first);
+ kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first);
if (dah_first) {
h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %px size %d\n", __func__, h_used,
- h_used->size);
+ kdb_func_printf("h_used %px size %d\n", h_used, h_used->size);
}
do {
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
+ kdb_func_printf("h_used %px size %d caller %px\n",
+ h_used, h_used->size, h_used->caller);
h_free = (struct debug_alloc_header *)
(debug_alloc_pool + h_free->next);
} while (h_free->next);
@@ -903,8 +890,8 @@ void debug_kusage(void)
((char *)h_free + dah_overhead + h_free->size);
if ((char *)h_used - debug_alloc_pool !=
sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
+ kdb_func_printf("h_used %px size %d caller %px\n",
+ h_used, h_used->size, h_used->caller);
out:
spin_unlock(&dap_lock);
}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 479fc145acfc..77b405508743 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -33,9 +33,6 @@ config NEED_DMA_MAP_STATE
config ARCH_DMA_ADDR_T_64BIT
def_bool 64BIT || PHYS_ADDR_T_64BIT
-config ARCH_HAS_DMA_COHERENCE_H
- bool
-
config ARCH_HAS_DMA_SET_MASK
bool
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index da95df381483..e0e64f8b0739 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -21,6 +21,7 @@
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
#define DMA_MAP_MAX_THREADS 1024
#define DMA_MAP_MAX_SECONDS 300
+#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
#define DMA_MAP_BIDIRECTIONAL 0
#define DMA_MAP_TO_DEVICE 1
@@ -36,7 +37,8 @@ struct map_benchmark {
__s32 node; /* which numa node this benchmark will run on */
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction */
- __u8 expansion[84]; /* For future use */
+ __u32 dma_trans_ns; /* time for DMA transmission in ns */
+ __u8 expansion[80]; /* For future use */
};
struct map_benchmark_data {
@@ -87,6 +89,9 @@ static int map_benchmark_thread(void *data)
map_etime = ktime_get();
map_delta = ktime_sub(map_etime, map_stime);
+ /* Pretend DMA is transmitting */
+ ndelay(map->bparam.dma_trans_ns);
+
unmap_stime = ktime_get();
dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
unmap_etime = ktime_get();
@@ -218,6 +223,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
return -EINVAL;
}
+ if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+ pr_err("invalid transmission delay\n");
+ return -EINVAL;
+ }
+
if (map->bparam.node != NUMA_NO_NODE &&
!node_possible(map->bparam.node)) {
pr_err("invalid numa node\n");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f87a89d08654..b6a633679933 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -16,6 +16,8 @@
#include "debug.h"
#include "direct.h"
+bool dma_default_coherent;
+
/*
* Managed DMA API
*/
@@ -515,46 +517,6 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
}
EXPORT_SYMBOL_GPL(dma_free_pages);
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
-{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- void *vaddr;
-
- if (!ops || !ops->alloc_noncoherent) {
- struct page *page;
-
- page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (!page)
- return NULL;
- return page_address(page);
- }
-
- size = PAGE_ALIGN(size);
- vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
- if (vaddr)
- debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
- *dma_handle);
- return vaddr;
-}
-EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
-
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, enum dma_data_direction dir)
-{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (!ops || !ops->free_noncoherent) {
- dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
- return;
- }
-
- size = PAGE_ALIGN(size);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
- ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
-}
-EXPORT_SYMBOL_GPL(dma_free_noncoherent);
-
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..c10e855a03bc 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -50,9 +50,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
-#define OFFSET(val,align) ((unsigned long) \
- ( (val) & ( (align) - 1)))
-
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
/*
@@ -103,6 +100,11 @@ static unsigned int max_segment;
static phys_addr_t *io_tlb_orig_addr;
/*
+ * The mapped buffer's size should be validated during a sync operation.
+ */
+static size_t *io_tlb_orig_size;
+
+/*
* Protect the above data structures in the map and unmap calls
*/
static DEFINE_SPINLOCK(io_tlb_lock);
@@ -171,7 +173,7 @@ void __init swiotlb_adjust_size(unsigned long new_size)
* adjust/expand SWIOTLB size for their use.
*/
if (!io_tlb_nslabs) {
- size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+ size = ALIGN(new_size, IO_TLB_SIZE);
io_tlb_nslabs = size >> IO_TLB_SHIFT;
io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
@@ -192,6 +194,16 @@ void swiotlb_print_info(void)
bytes >> 20);
}
+static inline unsigned long io_tlb_offset(unsigned long val)
+{
+ return val & (IO_TLB_SEGSIZE - 1);
+}
+
+static inline unsigned long nr_slots(u64 val)
+{
+ return DIV_ROUND_UP(val, IO_TLB_SIZE);
+}
+
/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
@@ -240,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
+ io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_orig_size)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -363,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
* between io_tlb_start and io_tlb_end.
*/
io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
+ get_order(io_tlb_nslabs * sizeof(int)));
if (!io_tlb_list)
goto cleanup3;
@@ -374,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!io_tlb_orig_addr)
goto cleanup4;
+ io_tlb_orig_size = (size_t *)
+ __get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs *
+ sizeof(size_t)));
+ if (!io_tlb_orig_size)
+ goto cleanup5;
+
+
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -389,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
return 0;
+cleanup5:
+ free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+ sizeof(phys_addr_t)));
+
cleanup4:
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
sizeof(int)));
@@ -404,6 +436,8 @@ void __init swiotlb_exit(void)
return;
if (late_alloc) {
+ free_pages((unsigned long)io_tlb_orig_size,
+ get_order(io_tlb_nslabs * sizeof(size_t)));
free_pages((unsigned long)io_tlb_orig_addr,
get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
@@ -413,6 +447,8 @@ void __init swiotlb_exit(void)
} else {
memblock_free_late(__pa(io_tlb_orig_addr),
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_orig_size),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
memblock_free_late(__pa(io_tlb_list),
PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
memblock_free_late(io_tlb_start,
@@ -461,79 +497,71 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
}
}
-phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
- size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start);
- unsigned long flags;
- phys_addr_t tlb_addr;
- unsigned int nslots, stride, index, wrap;
- int i;
- unsigned long mask;
- unsigned long offset_slots;
- unsigned long max_slots;
- unsigned long tmp_io_tlb_used;
-
- if (no_iotlb_memory)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
-
- if (mem_encrypt_active())
- pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
- if (mapping_size > alloc_size) {
- dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
- mapping_size, alloc_size);
- return (phys_addr_t)DMA_MAPPING_ERROR;
- }
+/*
+ * Return the offset into a iotlb slot required to keep the device happy.
+ */
+static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+{
+ return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
+}
- mask = dma_get_seg_boundary(hwdev);
+/*
+ * Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
+ */
+static inline unsigned long get_max_slots(unsigned long boundary_mask)
+{
+ if (boundary_mask == ~0UL)
+ return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ return nr_slots(boundary_mask + 1);
+}
- tbl_dma_addr &= mask;
+static unsigned int wrap_index(unsigned int index)
+{
+ if (index >= io_tlb_nslabs)
+ return 0;
+ return index;
+}
- offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+/*
+ * Find a suitable number of IO TLB entries size that will fit this request and
+ * allocate a buffer from that IO TLB pool.
+ */
+static int find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
+{
+ unsigned long boundary_mask = dma_get_seg_boundary(dev);
+ dma_addr_t tbl_dma_addr =
+ phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask;
+ unsigned long max_slots = get_max_slots(boundary_mask);
+ unsigned int iotlb_align_mask =
+ dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ unsigned int nslots = nr_slots(alloc_size), stride;
+ unsigned int index, wrap, count = 0, i;
+ unsigned long flags;
- /*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
- max_slots = mask + 1
- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ BUG_ON(!nslots);
/*
- * For mappings greater than or equal to a page, we limit the stride
- * (and hence alignment) to a page size.
+ * For mappings with an alignment requirement don't bother looping to
+ * unaligned slots once we found an aligned one. For allocations of
+ * PAGE_SIZE or larger only look for page aligned allocations.
*/
- nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
if (alloc_size >= PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
- BUG_ON(!nslots);
+ stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
spin_lock_irqsave(&io_tlb_lock, flags);
-
if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
goto not_found;
- index = ALIGN(io_tlb_index, stride);
- if (index >= io_tlb_nslabs)
- index = 0;
- wrap = index;
-
+ index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
do {
- while (iommu_is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
+ if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask)) {
+ index = wrap_index(index + 1);
+ continue;
}
/*
@@ -541,55 +569,96 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
* contiguous buffers, we allocate the buffers from that slot
* and mark the entries as '0' indicating unavailable.
*/
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in the next
- * round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
+ if (!iommu_is_span_boundary(index, nslots,
+ nr_slots(tbl_dma_addr),
+ max_slots)) {
+ if (io_tlb_list[index] >= nslots)
+ goto found;
}
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
+ index = wrap_index(index + stride);
} while (index != wrap);
not_found:
- tmp_io_tlb_used = io_tlb_used;
-
spin_unlock_irqrestore(&io_tlb_lock, flags);
- if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
- return (phys_addr_t)DMA_MAPPING_ERROR;
+ return -1;
+
found:
+ for (i = index; i < index + nslots; i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
+ io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+
+ /*
+ * Update the indices to avoid searching in the next round.
+ */
+ if (index + nslots < io_tlb_nslabs)
+ io_tlb_index = index + nslots;
+ else
+ io_tlb_index = 0;
io_tlb_used += nslots;
+
spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return index;
+}
+
+phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, i;
+ phys_addr_t tlb_addr;
+
+ if (no_iotlb_memory)
+ panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+
+ if (mem_encrypt_active())
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ if (mapping_size > alloc_size) {
+ dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ index = find_slots(dev, orig_addr, alloc_size + offset);
+ if (index == -1) {
+ if (!(attrs & DMA_ATTR_NO_WARN))
+ dev_warn_ratelimited(dev,
+ "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ alloc_size, io_tlb_nslabs, io_tlb_used);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+ io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
+ io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+ }
+ tlb_addr = slot_addr(io_tlb_start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
-
return tlb_addr;
}
+static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
+{
+ if (*size > orig_size) {
+ /* Warn and truncate mapping_size */
+ dev_WARN_ONCE(hwdev, 1,
+ "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
+ orig_size, *size);
+ *size = orig_size;
+ }
+}
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
@@ -598,10 +667,13 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ int i, count, nslots = nr_slots(alloc_size + offset);
+ int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
+ validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+
/*
* First, sync the memory before unmapping the entry
*/
@@ -617,26 +689,30 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* with slots below and above the pool being returned.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--) {
- io_tlb_list[i] = ++count;
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- }
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
+ if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
+ count = io_tlb_list[index + nslots];
+ else
+ count = 0;
- io_tlb_used -= nslots;
+ /*
+ * Step 1: return the slots to the free list, merging the slots with
+ * superceeding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ io_tlb_list[i] = ++count;
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
+
+ /*
+ * Step 2: merge the returned slots with the preceding slots, if
+ * available (non zero)
+ */
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+ i--)
+ io_tlb_list[i] = ++count;
+ io_tlb_used -= nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -645,11 +721,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
enum dma_sync_target target)
{
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ size_t orig_size = io_tlb_orig_size[index];
phys_addr_t orig_addr = io_tlb_orig_addr[index];
if (orig_addr == INVALID_PHYS_ADDR)
return;
- orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+
+ validate_sync_size_and_truncate(hwdev, orig_size, &size);
switch (target) {
case SYNC_FOR_CPU:
@@ -707,7 +785,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
}
bool is_swiotlb_active(void)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f9d491b17b78..8442e5c9cfa2 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -184,6 +184,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
+
+ /* Check if any of the above work has queued a deferred wakeup */
+ rcu_nocb_flush_deferred_wakeup();
+
ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -197,6 +201,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
+ /* Flush pending rcuog wakeup before the last need_resched() check */
+ rcu_nocb_flush_deferred_wakeup();
+
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
@@ -385,6 +392,9 @@ void irqentry_exit_cond_resched(void)
preempt_schedule_irq();
}
}
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
@@ -411,8 +421,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
+#ifdef CONFIG_PREEMT_DYNAMIC
+ static_call(irqentry_exit_cond_resched)();
+#else
irqentry_exit_cond_resched();
+#endif
+ }
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 55d18791a72d..03db40f6cba9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -53,6 +53,7 @@
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
+#include <linux/buildid.h>
#include "internal.h"
@@ -268,7 +269,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
if (!event->parent) {
/*
* If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
* perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
@@ -385,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -397,6 +399,7 @@ static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
+static atomic_t nr_build_id_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -1301,7 +1304,7 @@ static void put_ctx(struct perf_event_context *ctx)
* life-time rules separate them. That is an exiting task cannot fork, and a
* spawning task cannot (yet) exit.
*
- * But remember that that these are parent<->child context relations, and
+ * But remember that these are parent<->child context relations, and
* migration does not affect children, therefore these two orderings should not
* interact.
*
@@ -1440,7 +1443,7 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
*
- * This has to cope with with the fact that until it is locked,
+ * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
@@ -1595,50 +1598,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
groups->index = 0;
}
+static inline struct cgroup *event_cgroup(const struct perf_event *event)
+{
+ struct cgroup *cgroup = NULL;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp)
+ cgroup = event->cgrp->css.cgroup;
+#endif
+
+ return cgroup;
+}
+
/*
* Compare function for event groups;
*
* Implements complex key that first sorts by CPU and then by virtual index
* which provides ordering when rotating groups for the same CPU.
*/
-static bool
-perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+static __always_inline int
+perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+ const u64 left_group_index, const struct perf_event *right)
{
- if (left->cpu < right->cpu)
- return true;
- if (left->cpu > right->cpu)
- return false;
+ if (left_cpu < right->cpu)
+ return -1;
+ if (left_cpu > right->cpu)
+ return 1;
#ifdef CONFIG_CGROUP_PERF
- if (left->cgrp != right->cgrp) {
- if (!left->cgrp || !left->cgrp->css.cgroup) {
- /*
- * Left has no cgroup but right does, no cgroups come
- * first.
- */
- return true;
- }
- if (!right->cgrp || !right->cgrp->css.cgroup) {
- /*
- * Right has no cgroup but left does, no cgroups come
- * first.
- */
- return false;
- }
- /* Two dissimilar cgroups, order by id. */
- if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
- return true;
+ {
+ const struct cgroup *right_cgroup = event_cgroup(right);
- return false;
+ if (left_cgroup != right_cgroup) {
+ if (!left_cgroup) {
+ /*
+ * Left has no cgroup but right does, no
+ * cgroups come first.
+ */
+ return -1;
+ }
+ if (!right_cgroup) {
+ /*
+ * Right has no cgroup but left does, no
+ * cgroups come first.
+ */
+ return 1;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+ return -1;
+
+ return 1;
+ }
}
#endif
- if (left->group_index < right->group_index)
- return true;
- if (left->group_index > right->group_index)
- return false;
+ if (left_group_index < right->group_index)
+ return -1;
+ if (left_group_index > right->group_index)
+ return 1;
- return false;
+ return 0;
+}
+
+#define __node_2_pe(node) \
+ rb_entry((node), struct perf_event, group_node)
+
+static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct perf_event *e = __node_2_pe(a);
+ return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+ __node_2_pe(b)) < 0;
+}
+
+struct __group_key {
+ int cpu;
+ struct cgroup *cgroup;
+};
+
+static inline int __group_cmp(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
}
/*
@@ -1650,27 +1694,9 @@ static void
perf_event_groups_insert(struct perf_event_groups *groups,
struct perf_event *event)
{
- struct perf_event *node_event;
- struct rb_node *parent;
- struct rb_node **node;
-
event->group_index = ++groups->index;
- node = &groups->tree.rb_node;
- parent = *node;
-
- while (*node) {
- parent = *node;
- node_event = container_of(*node, struct perf_event, group_node);
-
- if (perf_event_groups_less(event, node_event))
- node = &parent->rb_left;
- else
- node = &parent->rb_right;
- }
-
- rb_link_node(&event->group_node, parent, node);
- rb_insert_color(&event->group_node, &groups->tree);
+ rb_add(&event->group_node, &groups->tree, __group_less);
}
/*
@@ -1718,45 +1744,17 @@ static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
struct cgroup *cgrp)
{
- struct perf_event *node_event = NULL, *match = NULL;
- struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
- u64 node_cgrp_id, cgrp_id = 0;
-
- if (cgrp)
- cgrp_id = cgrp->kn->id;
-#endif
-
- while (node) {
- node_event = container_of(node, struct perf_event, group_node);
-
- if (cpu < node_event->cpu) {
- node = node->rb_left;
- continue;
- }
- if (cpu > node_event->cpu) {
- node = node->rb_right;
- continue;
- }
-#ifdef CONFIG_CGROUP_PERF
- node_cgrp_id = 0;
- if (node_event->cgrp && node_event->cgrp->css.cgroup)
- node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = cpu,
+ .cgroup = cgrp,
+ };
+ struct rb_node *node;
- if (cgrp_id < node_cgrp_id) {
- node = node->rb_left;
- continue;
- }
- if (cgrp_id > node_cgrp_id) {
- node = node->rb_right;
- continue;
- }
-#endif
- match = node_event;
- node = node->rb_left;
- }
+ node = rb_find_first(&key, &groups->tree, __group_cmp);
+ if (node)
+ return __node_2_pe(node);
- return match;
+ return NULL;
}
/*
@@ -1765,27 +1763,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
- struct perf_event *next;
-#ifdef CONFIG_CGROUP_PERF
- u64 curr_cgrp_id = 0;
- u64 next_cgrp_id = 0;
-#endif
-
- next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next == NULL || next->cpu != event->cpu)
- return NULL;
-
-#ifdef CONFIG_CGROUP_PERF
- if (event->cgrp && event->cgrp->css.cgroup)
- curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = event->cpu,
+ .cgroup = event_cgroup(event),
+ };
+ struct rb_node *next;
- if (next->cgrp && next->cgrp->css.cgroup)
- next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+ next = rb_next_match(&key, &event->group_node, __group_cmp);
+ if (next)
+ return __node_2_pe(next);
- if (curr_cgrp_id != next_cgrp_id)
- return NULL;
-#endif
- return next;
+ return NULL;
}
/*
@@ -1879,8 +1867,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
- if (sample_type & PERF_SAMPLE_WEIGHT)
- size += sizeof(data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
@@ -2499,7 +2487,7 @@ static void perf_set_shadow_time(struct perf_event *event,
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
- * within the time time source all along. We believe it
+ * within the time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
@@ -3474,11 +3462,16 @@ unlock:
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3486,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3515,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3537,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3845,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4669,10 +4689,12 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_dec(&nr_build_id_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
@@ -6907,8 +6929,8 @@ void perf_output_sample(struct perf_output_handle *handle,
data->regs_user.regs);
}
- if (sample_type & PERF_SAMPLE_WEIGHT)
- perf_output_put(handle, data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ perf_output_put(handle, data->weight.full);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
@@ -8046,6 +8068,8 @@ struct perf_mmap_event {
u64 ino;
u64 ino_generation;
u32 prot, flags;
+ u8 build_id[BUILD_ID_SIZE_MAX];
+ u32 build_id_size;
struct {
struct perf_event_header header;
@@ -8077,6 +8101,7 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
u32 type = mmap_event->event_id.header.type;
+ bool use_build_id;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -8101,13 +8126,25 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
+ use_build_id = event->attr.build_id && mmap_event->build_id_size;
+
+ if (event->attr.mmap2 && use_build_id)
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
+
perf_output_put(&handle, mmap_event->event_id);
if (event->attr.mmap2) {
- perf_output_put(&handle, mmap_event->maj);
- perf_output_put(&handle, mmap_event->min);
- perf_output_put(&handle, mmap_event->ino);
- perf_output_put(&handle, mmap_event->ino_generation);
+ if (use_build_id) {
+ u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
+
+ __output_copy(&handle, size, 4);
+ __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
+ } else {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ }
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
@@ -8236,6 +8273,9 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+ if (atomic_read(&nr_build_id_events))
+ build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -11168,10 +11208,12 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_inc(&nr_build_id_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
@@ -11564,6 +11606,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_CGROUP)
return -EINVAL;
#endif
+ if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
+ (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+ return -EINVAL;
out:
return ret;
@@ -12960,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bf9edd8d75be..6addc9780319 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -613,41 +613,56 @@ static void put_uprobe(struct uprobe *uprobe)
}
}
-static int match_uprobe(struct uprobe *l, struct uprobe *r)
+static __always_inline
+int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
+ const struct uprobe *r)
{
- if (l->inode < r->inode)
+ if (l_inode < r->inode)
return -1;
- if (l->inode > r->inode)
+ if (l_inode > r->inode)
return 1;
- if (l->offset < r->offset)
+ if (l_offset < r->offset)
return -1;
- if (l->offset > r->offset)
+ if (l_offset > r->offset)
return 1;
return 0;
}
+#define __node_2_uprobe(node) \
+ rb_entry((node), struct uprobe, rb_node)
+
+struct __uprobe_key {
+ struct inode *inode;
+ loff_t offset;
+};
+
+static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
+{
+ const struct __uprobe_key *a = key;
+ return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
+}
+
+static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct uprobe *u = __node_2_uprobe(a);
+ return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
+}
+
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
- struct uprobe u = { .inode = inode, .offset = offset };
- struct rb_node *n = uprobes_tree.rb_node;
- struct uprobe *uprobe;
- int match;
+ struct __uprobe_key key = {
+ .inode = inode,
+ .offset = offset,
+ };
+ struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
- while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
- if (!match)
- return get_uprobe(uprobe);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- if (match < 0)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
return NULL;
}
@@ -668,32 +683,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
- struct rb_node **p = &uprobes_tree.rb_node;
- struct rb_node *parent = NULL;
- struct uprobe *u;
- int match;
+ struct rb_node *node;
- while (*p) {
- parent = *p;
- u = rb_entry(parent, struct uprobe, rb_node);
- match = match_uprobe(uprobe, u);
- if (!match)
- return get_uprobe(u);
+ node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- if (match < 0)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
-
- }
-
- u = NULL;
- rb_link_node(&uprobe->rb_node, parent, p);
- rb_insert_color(&uprobe->rb_node, &uprobes_tree);
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
-
- return u;
+ return NULL;
}
/*
@@ -1735,7 +1733,7 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task if if necessary.
+ * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..54cc905e5fe0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_pasid(struct mm_struct *mm)
+{
+#ifdef CONFIG_IOMMU_SUPPORT
+ mm->pasid = INIT_PASID;
+#endif
+}
+
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1940,6 +1948,8 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ if (args->io_thread)
+ p->flags |= PF_IO_WORKER;
/*
* This _must_ happen before we call free_task(), i.e. before we jump
@@ -2411,6 +2421,34 @@ struct mm_struct *copy_init_mm(void)
}
/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ CLONE_IO;
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ .io_thread = 1,
+ };
+ struct task_struct *tsk;
+
+ tsk = copy_process(NULL, 0, node, &args);
+ if (!IS_ERR(tsk)) {
+ sigfillset(&tsk->blocked);
+ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+ }
+ return tsk;
+}
+
+/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
diff --git a/kernel/freezer.c b/kernel/freezer.c
index dc520f01f99d..1a2d57d1327c 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p)
return false;
}
- if (!(p->flags & PF_KTHREAD))
+ if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))
fake_signal_wake_up(p);
else
wake_up_state(p, TASK_INTERRUPTIBLE);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45a13eb8894e..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
goto out;
restart = &current->restart_block;
- restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
- ret = -ERESTART_RESTARTBLOCK;
+ ret = set_restart_fn(restart, futex_wait_restart);
out:
if (to) {
@@ -3012,7 +3011,7 @@ retry:
* Success, we're done! No tricky corner cases.
*/
if (!ret)
- goto out_putkey;
+ return ret;
/*
* The atomic access to the futex value generated a
* pagefault, so retry the user-access and the wakeup:
@@ -3029,7 +3028,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_putkey;
+ return ret;
}
/*
@@ -3050,7 +3049,7 @@ retry:
default:
WARN_ON_ONCE(1);
- goto out_putkey;
+ return ret;
}
}
@@ -3061,7 +3060,6 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
-out_putkey:
return ret;
pi_retry:
@@ -3763,8 +3761,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
{
struct timespec64 ts;
ktime_t t, *tp = NULL;
@@ -3959,7 +3957,7 @@ err_unlock:
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
- struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
struct timespec64 ts;
diff --git a/kernel/groups.c b/kernel/groups.c
index fe7e6385530e..787b381c7c00 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -15,12 +15,7 @@
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *gi;
- unsigned int len;
-
- len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
- gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
- if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT);
+ gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
if (!gi)
return NULL;
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 48006608baf0..40880c350b95 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
* irq_domain_create_sim - Create a new interrupt simulator irq_domain and
* allocate a range of dummy interrupts.
*
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate.
*
* On success: return a new irq_domain object.
@@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
* a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
* On success: return a new irq_domain object.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6aacd342cd14..d10ab1d689d5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -205,6 +205,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
fwnode_handle_get(fwnode);
+ fwnode_dev_initialized(fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
@@ -253,6 +254,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
+ fwnode_dev_initialized(domain->fwnode, false);
fwnode_handle_put(domain->fwnode);
if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
kfree(domain->name);
@@ -1896,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
- if (!d->name || !domain_dir || d->debugfs_file)
+ if (!d->name || !domain_dir)
return;
- d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &irq_domain_debug_fops);
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(d->debugfs_file);
- d->debugfs_file = NULL;
+ debugfs_remove(debugfs_lookup(d->name, domain_dir));
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dec3f73e8db9..21ea370fccda 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 8ccd32a0cc80..bd1d85c610aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -27,7 +27,7 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
/*
* Run software resends of IRQ's
*/
-static void resend_irqs(unsigned long arg)
+static void resend_irqs(struct tasklet_struct *unused)
{
struct irq_desc *desc;
int irq;
@@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg)
}
/* Tasklet to handle resend: */
-static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs);
+static DECLARE_TASKLET(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index c6a39d662935..ba39fbb1f8e7 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
WARN_ONCE(!jump_entry_is_init(entry),
"can't patch jump_label at %pS",
(void *)jump_entry_code(entry));
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fe9de067771c..8043a90aa50e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -177,6 +177,11 @@ unsigned long kallsyms_lookup_name(const char *name)
return module_kallsyms_lookup_name(name);
}
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Iterate over all symbols in vmlinux. For symbols from modules use
+ * module_kallsyms_on_each_symbol instead.
+ */
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
@@ -192,8 +197,9 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
if (ret != 0)
return ret;
}
- return module_kallsyms_on_each_symbol(fn, data);
+ return 0;
}
+#endif /* CONFIG_LIVEPATCH */
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 3994a217bde7..3bf98db9c702 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -12,7 +12,6 @@
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
-#include <linux/random.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
@@ -101,7 +100,7 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
static DEFINE_PER_CPU(long, kcsan_skip);
/* For kcsan_prandom_u32_max(). */
-static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state);
+static DEFINE_PER_CPU(u32, kcsan_rand_state);
static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
size_t size,
@@ -275,20 +274,17 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
}
/*
- * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max()
- * for more details.
- *
- * The open-coded version here is using only safe primitives for all contexts
- * where we can have KCSAN instrumentation. In particular, we cannot use
- * prandom_u32() directly, as its tracepoint could cause recursion.
+ * Returns a pseudo-random number in interval [0, ep_ro). Simple linear
+ * congruential generator, using constants from "Numerical Recipes".
*/
static u32 kcsan_prandom_u32_max(u32 ep_ro)
{
- struct rnd_state *state = &get_cpu_var(kcsan_rand_state);
- const u32 res = prandom_u32_state(state);
+ u32 state = this_cpu_read(kcsan_rand_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(kcsan_rand_state, state);
- put_cpu_var(kcsan_rand_state);
- return (u32)(((u64) res * ep_ro) >> 32);
+ return state % ep_ro;
}
static inline void reset_kcsan_skip(void)
@@ -639,10 +635,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
void __init kcsan_init(void)
{
+ int cpu;
+
BUG_ON(!in_task());
kcsan_debugfs_init();
- prandom_seed_full_state(&kcsan_rand_state);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles();
/*
* We are in the init task, and no other tasks should be running;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index aa919585c24b..a0b6780740c8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
if (!buf)
return;
memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
+ prstatus.common.pr_pid = current->pid;
elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b02086d70492..5c3447cf7ad5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -166,6 +166,11 @@ void kimage_file_post_load_cleanup(struct kimage *image)
vfree(pi->sechdrs);
pi->sechdrs = NULL;
+#ifdef CONFIG_IMA_KEXEC
+ vfree(image->ima_buffer);
+ image->ima_buffer = NULL;
+#endif /* CONFIG_IMA_KEXEC */
+
/* See if architecture has anything to cleanup post load */
arch_kimage_file_post_load_cleanup(image);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 39d30ccf8d87..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-int machine_kexec_post_load(struct kimage *image);
-
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a3eb74a657..745f08fdd7a6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -861,7 +861,6 @@ out:
cpus_read_unlock();
}
-#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -887,6 +886,7 @@ out:
mutex_unlock(&kprobe_mutex);
}
+#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -1520,13 +1520,16 @@ valid:
return ap;
}
-/* Return error if the kprobe is being re-registered */
-static inline int check_kprobe_rereg(struct kprobe *p)
+/*
+ * Warn and return error if the kprobe is being re-registered since
+ * there must be a software bug.
+ */
+static inline int warn_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
mutex_lock(&kprobe_mutex);
- if (__get_valid_kprobe(p))
+ if (WARN_ON_ONCE(__get_valid_kprobe(p)))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
@@ -1614,7 +1617,7 @@ int register_kprobe(struct kprobe *p)
return PTR_ERR(addr);
p->addr = addr;
- ret = check_kprobe_rereg(p);
+ ret = warn_kprobe_rereg(p);
if (ret)
return ret;
@@ -1995,7 +1998,7 @@ int register_kretprobe(struct kretprobe *rp)
return ret;
/* If only rp->kp.addr is specified, check reregistering kprobes */
- if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
+ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
if (kretprobe_blacklist_size) {
@@ -2497,18 +2500,14 @@ static int __init init_kprobes(void)
}
}
-#if defined(CONFIG_OPTPROBES)
-#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots */
- kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
-#endif
- /* By default, kprobes can be optimized */
- kprobes_allow_optimization = true;
-#endif
-
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init kprobe_optinsn_slots for allocation */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -2523,6 +2522,21 @@ static int __init init_kprobes(void)
}
early_initcall(init_kprobes);
+#if defined(CONFIG_OPTPROBES)
+static int __init init_optprobes(void)
+{
+ /*
+ * Enable kprobe optimization - this kicks the optimizer which
+ * depends on synchronize_rcu_tasks() and ksoftirqd, that is
+ * not spawned in early initcall. So delay the optimization.
+ */
+ optimize_all_kprobes();
+
+ return 0;
+}
+subsys_initcall(init_optprobes);
+#endif
+
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f76fdb925532..335d988bd811 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -19,6 +19,7 @@
#include <linux/moduleloader.h>
#include <linux/completion.h>
#include <linux/memory.h>
+#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
@@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (mod && mod->klp_alive)
obj->mod = mod;
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
}
static bool klp_initialized(void)
@@ -163,12 +164,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
.pos = sympos,
};
- mutex_lock(&module_mutex);
if (objname)
module_kallsyms_on_each_symbol(klp_find_callback, &args);
else
kallsyms_on_each_symbol(klp_find_callback, &args);
- mutex_unlock(&module_mutex);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6d11cfb9b41f..8838f1d7c4a2 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,6 +15,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..810b50344d35
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bdaf4829098c..c6d0c1dc6253 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1290,6 +1290,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class->name_version = count_matching_names(class);
class->wait_type_inner = lock->wait_type_inner;
class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -1671,6 +1672,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
static enum bfs_result __bfs(struct lock_list *source_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
int offset)
{
@@ -1731,7 +1733,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
/*
* Step 3: we haven't visited this and there is a strong
* dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
*/
+ if (skip && skip(lock, data))
+ continue;
+
if (match(lock, data)) {
*target_entry = lock;
return BFS_RMATCH;
@@ -1774,9 +1781,10 @@ static inline enum bfs_result
__bfs_forwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_after));
}
@@ -1785,9 +1793,10 @@ static inline enum bfs_result
__bfs_backwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_before));
}
@@ -2018,7 +2027,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2043,7 +2052,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2071,11 +2080,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
static noinline enum bfs_result
check_path(struct held_lock *target, struct lock_list *src_entry,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
enum bfs_result ret;
- ret = __bfs_forwards(src_entry, target, match, target_entry);
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
@@ -2102,7 +2112,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
@@ -2120,46 +2130,6 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
return ret;
}
-#ifdef CONFIG_LOCKDEP_SMALL
-/*
- * Check that the dependency graph starting at <src> can lead to
- * <target> or not. If it can, <src> -> <target> dependency is already
- * in the graph.
- *
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
- * any error appears in the bfs search.
- */
-static noinline enum bfs_result
-check_redundant(struct held_lock *src, struct held_lock *target)
-{
- enum bfs_result ret;
- struct lock_list *target_entry;
- struct lock_list src_entry;
-
- bfs_init_root(&src_entry, src);
- /*
- * Special setup for check_redundant().
- *
- * To report redundant, we need to find a strong dependency path that
- * is equal to or stronger than <src> -> <target>. So if <src> is E,
- * we need to let __bfs() only search for a path starting at a -(E*)->,
- * we achieve this by setting the initial node's ->only_xr to true in
- * that case. And if <prev> is S, we set initial ->only_xr to false
- * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
- */
- src_entry.only_xr = src->read == 0;
-
- debug_atomic_inc(nr_redundant_checks);
-
- ret = check_path(target, &src_entry, hlock_equal, &target_entry);
-
- if (ret == BFS_RMATCH)
- debug_atomic_inc(nr_redundant);
-
- return ret;
-}
-#endif
-
#ifdef CONFIG_TRACE_IRQFLAGS
/*
@@ -2230,6 +2200,44 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU) {
+ if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -2245,7 +2253,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2262,7 +2270,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2627,7 +2635,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
bfs_init_rootb(&this, prev);
- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
@@ -2694,8 +2702,68 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ return false;
+}
+
#endif /* CONFIG_TRACE_IRQFLAGS */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
+}
+
+#else
+
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ return BFS_RNOMATCH;
+}
+
+#endif
+
static void inc_chains(int irq_context)
{
if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
@@ -2916,7 +2984,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
-#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
@@ -2925,7 +2992,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 0;
else if (ret == BFS_RMATCH)
return 2;
-#endif
if (!*trace) {
*trace = save_trace();
@@ -3707,7 +3773,7 @@ static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (!debug_locks_off() || debug_locks_silent)
return;
pr_warn("\n");
@@ -3748,6 +3814,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
print_usage_bug(curr, this, bad_bit, new_bit);
return 0;
}
@@ -4503,9 +4570,9 @@ print_lock_invalid_wait_context(struct task_struct *curr,
*/
static int check_wait_context(struct task_struct *curr, struct held_lock *next)
{
- short next_inner = hlock_class(next)->wait_type_inner;
- short next_outer = hlock_class(next)->wait_type_outer;
- short curr_inner;
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
int depth;
if (!curr->lockdep_depth || !next_inner || next->trylock)
@@ -4528,7 +4595,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- short prev_inner = hlock_class(prev)->wait_type_inner;
+ u8 prev_inner = hlock_class(prev)->wait_type_inner;
if (prev_inner) {
/*
@@ -4577,9 +4644,9 @@ static inline int check_wait_context(struct task_struct *curr,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass,
- short inner, short outer)
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
@@ -4602,6 +4669,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
lock->wait_type_outer = outer;
lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
/*
* No key, no joy, we need to hash something.
@@ -4636,7 +4704,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index fd838cea3934..0ab94e1f1276 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -27,7 +27,6 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
#include <linux/reboot.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5352ce50a97e..622ebdfcd083 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -86,16 +86,6 @@ bool mutex_is_locked(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_is_locked);
-__must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock)
-{
- if (unlikely(__mutex_owner(lock) == current))
- return MUTEX_TRYLOCK_RECURSIVE;
-
- return mutex_trylock(lock);
-}
-EXPORT_SYMBOL(mutex_trylock_recursive);
-
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
@@ -636,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -712,7 +702,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -932,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
#ifdef CONFIG_DEBUG_MUTEXES
@@ -939,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
#endif
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -956,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
@@ -970,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
@@ -1023,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
@@ -1036,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* ww_mutex needs to always recheck its position since its waiter
* list is not FIFO ordered.
*/
- if ((use_ww_ctx && ww_ctx) || !first) {
+ if (ww_ctx || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1049,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* or we must see its unlock and acquire.
*/
if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
spin_lock(&lock->wait_lock);
@@ -1058,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1078,7 +1071,7 @@ skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
spin_unlock(&lock->wait_lock);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fe9ca92faa2a..4786dd271b45 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,7 +12,6 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
-#include <asm/qrwlock.h>
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2f8cd616d3b2..48fff6437901 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -267,27 +267,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree_entry)
+
+static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+ rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
}
static void
@@ -300,27 +291,18 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
RB_CLEAR_NODE(&waiter->tree_entry);
}
+#define __node_2_pi_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+
+static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+}
+
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+ rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
}
static void
@@ -1438,7 +1420,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
}
/*
- * Performs the wakeup of the the top-waiter and re-enables preemption.
+ * Performs the wakeup of the top-waiter and re-enables preemption.
*/
void rt_mutex_postunlock(struct wake_q_head *wake_q)
{
@@ -1604,8 +1586,11 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wake_q: The wake queue head from which to get the next lock waiter
*/
bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1662,13 +1647,15 @@ void rt_mutex_destroy(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
/**
- * __rt_mutex_init - initialize the rt lock
+ * __rt_mutex_init - initialize the rt_mutex
*
- * @lock: the rt lock to be initialized
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
*
- * Initialize the rt lock to unlocked state.
+ * Initialize the rt_mutex to unlocked state.
*
- * Initializing of a locked rt lock is not allowed
+ * Initializing of a locked rt_mutex is not allowed
*/
void __rt_mutex_init(struct rt_mutex *lock, const char *name,
struct lock_class_key *key)
@@ -1832,7 +1819,7 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Wait for the the lock acquisition started on our behalf by
+ * Wait for the lock acquisition started on our behalf by
* rt_mutex_start_proxy_lock(). Upon failure, the caller must call
* rt_mutex_cleanup_proxy_lock().
*
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ba67600c7b2c..abba5df50006 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1048,7 +1048,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/*
* If there were already threads queued before us and:
- * 1) there are no no active locks, wake the front
+ * 1) there are no active locks, wake the front
* queued process(es) as the handoff bit might be set.
* 2) there are no active writers and some readers, the lock
* must be read owned; so we try to wake any read lock
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index d9dd94defc0a..9aa855a96c4a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
diff --git a/kernel/module.c b/kernel/module.c
index 4bf30e4b3eaa..30479355ab85 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,8 +87,7 @@
* 3) module_addr_min/module_addr_max.
* (delete and add uses RCU list operations).
*/
-DEFINE_MUTEX(module_mutex);
-EXPORT_SYMBOL_GPL(module_mutex);
+static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
@@ -256,11 +255,6 @@ static void mod_update_bounds(struct module *mod)
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
-static void module_assert_mutex(void)
-{
- lockdep_assert_held(&module_mutex);
-}
-
static void module_assert_mutex_or_preempt(void)
{
#ifdef CONFIG_LOCKDEP
@@ -414,19 +408,8 @@ extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];
-extern const s32 __start___kcrctab_gpl_future[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const s32 __start___kcrctab_unused[];
-extern const s32 __start___kcrctab_unused_gpl[];
-#endif
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
@@ -434,87 +417,14 @@ extern const s32 __start___kcrctab_unused_gpl[];
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
-static bool each_symbol_in_section(const struct symsearch *arr,
- unsigned int arrsize,
- struct module *owner,
- bool (*fn)(const struct symsearch *syms,
- struct module *owner,
- void *data),
- void *data)
-{
- unsigned int j;
-
- for (j = 0; j < arrsize; j++) {
- if (fn(&arr[j], owner, data))
- return true;
- }
-
- return false;
-}
-
-/* Returns true as soon as fn returns true, otherwise false. */
-static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
- struct module *owner,
- void *data),
- void *data)
-{
- struct module *mod;
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY, false },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY, false },
- { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
- __start___kcrctab_gpl_future,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { __start___ksymtab_unused, __stop___ksymtab_unused,
- __start___kcrctab_unused,
- NOT_GPL_ONLY, true },
- { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
- __start___kcrctab_unused_gpl,
- GPL_ONLY, true },
-#endif
- };
-
- module_assert_mutex_or_preempt();
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
- return true;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY, false },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY, false },
- { mod->gpl_future_syms,
- mod->gpl_future_syms + mod->num_gpl_future_syms,
- mod->gpl_future_crcs,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms,
- mod->unused_syms + mod->num_unused_syms,
- mod->unused_crcs,
- NOT_GPL_ONLY, true },
- { mod->unused_gpl_syms,
- mod->unused_gpl_syms + mod->num_unused_gpl_syms,
- mod->unused_gpl_crcs,
- GPL_ONLY, true },
-#endif
- };
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
- return true;
- }
- return false;
-}
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const s32 *crcs;
+ enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+ } license;
+};
struct find_symbol_arg {
/* Input */
@@ -535,28 +445,8 @@ static bool check_exported_symbol(const struct symsearch *syms,
{
struct find_symbol_arg *fsa = data;
- if (!fsa->gplok) {
- if (syms->license == GPL_ONLY)
- return false;
- if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
- pr_warn("Symbol %s is being used by a non-GPL module, "
- "which will not be allowed in the future\n",
- fsa->name);
- }
- }
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- if (syms->unused && fsa->warn) {
- pr_warn("Symbol %s is marked as UNUSED, however this module is "
- "using it.\n", fsa->name);
- pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evaluate if this is the right api to use and "
- "if it really is, submit a report to the linux kernel "
- "mailing list together with submitting your code for "
- "inclusion.\n");
- }
-#endif
-
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
@@ -619,31 +509,44 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex.
*/
-static const struct kernel_symbol *find_symbol(const char *name,
- struct module **owner,
- const s32 **crc,
- enum mod_license *license,
- bool gplok,
- bool warn)
-{
- struct find_symbol_arg fsa;
-
- fsa.name = name;
- fsa.gplok = gplok;
- fsa.warn = warn;
-
- if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
- if (owner)
- *owner = fsa.owner;
- if (crc)
- *crc = fsa.crc;
- if (license)
- *license = fsa.license;
- return fsa.sym;
+static bool find_symbol(struct find_symbol_arg *fsa)
+{
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY },
+ };
+ struct module *mod;
+ unsigned int i;
+
+ module_assert_mutex_or_preempt();
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY },
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+ return true;
}
- pr_debug("Failed to find symbol %s\n", name);
- return NULL;
+ pr_debug("Failed to find symbol %s\n", fsa->name);
+ return false;
}
/*
@@ -669,10 +572,8 @@ static struct module *find_module_all(const char *name, size_t len,
struct module *find_module(const char *name)
{
- module_assert_mutex();
return find_module_all(name, strlen(name), false);
}
-EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -1107,12 +1008,15 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
void __symbol_put(const char *symbol)
{
- struct module *owner;
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ };
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
+ if (!find_symbol(&fsa))
BUG();
- module_put(owner);
+ module_put(fsa.owner);
preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);
@@ -1381,19 +1285,22 @@ bad_version:
static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
- const s32 *crc;
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
/*
* Since this should be found in kernel (which can't be removed), no
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
+ if (!find_symbol(&fsa)) {
preempt_enable();
BUG();
}
preempt_enable();
- return check_version(info, "module_layout", mod, crc);
+ return check_version(info, "module_layout", mod, fsa.crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1487,10 +1394,11 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
const char *name,
char ownername[])
{
- struct module *owner;
- const struct kernel_symbol *sym;
- const s32 *crc;
- enum mod_license license;
+ struct find_symbol_arg fsa = {
+ .name = name,
+ .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+ .warn = true,
+ };
int err;
/*
@@ -1500,42 +1408,40 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc, &license,
- !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- if (!sym)
+ if (!find_symbol(&fsa))
goto unlock;
- if (license == GPL_ONLY)
+ if (fsa.license == GPL_ONLY)
mod->using_gplonly_symbols = true;
- if (!inherit_taint(mod, owner)) {
- sym = NULL;
+ if (!inherit_taint(mod, fsa.owner)) {
+ fsa.sym = NULL;
goto getname;
}
- if (!check_version(info, name, mod, crc)) {
- sym = ERR_PTR(-EINVAL);
+ if (!check_version(info, name, mod, fsa.crc)) {
+ fsa.sym = ERR_PTR(-EINVAL);
goto getname;
}
- err = verify_namespace_is_imported(info, sym, mod);
+ err = verify_namespace_is_imported(info, fsa.sym, mod);
if (err) {
- sym = ERR_PTR(err);
+ fsa.sym = ERR_PTR(err);
goto getname;
}
- err = ref_module(mod, owner);
+ err = ref_module(mod, fsa.owner);
if (err) {
- sym = ERR_PTR(err);
+ fsa.sym = ERR_PTR(err);
goto getname;
}
getname:
/* We must make copy under the lock if we failed to get ref. */
- strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+ strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
unlock:
mutex_unlock(&module_mutex);
- return sym;
+ return fsa.sym;
}
static const struct kernel_symbol *
@@ -2296,16 +2202,19 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
- struct module *owner;
- const struct kernel_symbol *sym;
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ .warn = true,
+ };
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
- if (sym && strong_try_module_get(owner))
- sym = NULL;
+ if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
+ preempt_enable();
+ return NULL;
+ }
preempt_enable();
-
- return sym ? (void *)kernel_symbol_value(sym) : NULL;
+ return (void *)kernel_symbol_value(fsa.sym);
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -2318,7 +2227,6 @@ EXPORT_SYMBOL_GPL(__symbol_get);
static int verify_exported_symbols(struct module *mod)
{
unsigned int i;
- struct module *owner;
const struct kernel_symbol *s;
struct {
const struct kernel_symbol *sym;
@@ -2326,21 +2234,19 @@ static int verify_exported_symbols(struct module *mod)
} arr[] = {
{ mod->syms, mod->num_syms },
{ mod->gpl_syms, mod->num_gpl_syms },
- { mod->gpl_future_syms, mod->num_gpl_future_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms, mod->num_unused_syms },
- { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
};
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- NULL, true, false)) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
- module_name(owner));
+ module_name(fsa.owner));
return -ENOEXEC;
}
}
@@ -2348,6 +2254,21 @@ static int verify_exported_symbols(struct module *mod)
return 0;
}
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
/* Change all symbols so that st_value encodes the pointer directly. */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
@@ -2395,8 +2316,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
break;
}
- /* Ok if weak. */
- if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
break;
ret = PTR_ERR(ksym) ?: -ENOENT;
@@ -2964,7 +2887,7 @@ static int module_sig_check(struct load_info *info, int flags)
}
if (is_module_sig_enforced()) {
- pr_notice("%s: loading of %s is rejected\n", info->name, reason);
+ pr_notice("Loading of %s is rejected\n", reason);
return -EKEYREJECTED;
}
@@ -2977,9 +2900,33 @@ static int module_sig_check(struct load_info *info, int flags)
}
#endif /* !CONFIG_MODULE_SIG */
-/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
-static int elf_header_check(struct load_info *info)
+static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
+{
+ unsigned long secend;
+
+ /*
+ * Check for both overflow and offset/size being
+ * too large.
+ */
+ secend = shdr->sh_offset + shdr->sh_size;
+ if (secend < shdr->sh_offset || secend > info->len)
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/*
+ * Sanity checks against invalid binaries, wrong arch, weird elf version.
+ *
+ * Also do basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ */
+static int elf_validity_check(struct load_info *info)
{
+ unsigned int i;
+ Elf_Shdr *shdr, *strhdr;
+ int err;
+
if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
@@ -2989,11 +2936,78 @@ static int elf_header_check(struct load_info *info)
|| info->hdr->e_shentsize != sizeof(Elf_Shdr))
return -ENOEXEC;
+ /*
+ * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+ * known and small. So e_shnum * sizeof(Elf_Shdr)
+ * will not overflow unsigned long on any platform.
+ */
if (info->hdr->e_shoff >= info->len
|| (info->hdr->e_shnum * sizeof(Elf_Shdr) >
info->len - info->hdr->e_shoff))
return -ENOEXEC;
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+ /*
+ * Verify if the section name table index is valid.
+ */
+ if (info->hdr->e_shstrndx == SHN_UNDEF
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum)
+ return -ENOEXEC;
+
+ strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+ err = validate_section_offset(info, strhdr);
+ if (err < 0)
+ return err;
+
+ /*
+ * The section name table must be NUL-terminated, as required
+ * by the spec. This makes strcmp and pr_* calls that access
+ * strings in the section safe.
+ */
+ info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (info->secstrings[strhdr->sh_size - 1] != '\0')
+ return -ENOEXEC;
+
+ /*
+ * The code assumes that section 0 has a length of zero and
+ * an addr of zero, so check for it.
+ */
+ if (info->sechdrs[0].sh_type != SHT_NULL
+ || info->sechdrs[0].sh_size != 0
+ || info->sechdrs[0].sh_addr != 0)
+ return -ENOEXEC;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ shdr = &info->sechdrs[i];
+ switch (shdr->sh_type) {
+ case SHT_NULL:
+ case SHT_NOBITS:
+ continue;
+ case SHT_SYMTAB:
+ if (shdr->sh_link == SHN_UNDEF
+ || shdr->sh_link >= info->hdr->e_shnum)
+ return -ENOEXEC;
+ fallthrough;
+ default:
+ err = validate_section_offset(info, shdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return err;
+ }
+
+ if (shdr->sh_flags & SHF_ALLOC) {
+ if (shdr->sh_name >= strhdr->sh_size) {
+ pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return -ENOEXEC;
+ }
+ }
+ break;
+ }
+ }
+
return 0;
}
@@ -3095,11 +3109,6 @@ static int rewrite_section_headers(struct load_info *info, int flags)
for (i = 1; i < info->hdr->e_shnum; i++) {
Elf_Shdr *shdr = &info->sechdrs[i];
- if (shdr->sh_type != SHT_NOBITS
- && info->len < shdr->sh_offset + shdr->sh_size) {
- pr_err("Module len %lu truncated\n", info->len);
- return -ENOEXEC;
- }
/*
* Mark all sections sh_addr with their address in the
@@ -3133,11 +3142,6 @@ static int setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
- /* Set up the convenience variables */
- info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
- info->secstrings = (void *)info->hdr
- + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
-
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
if (info->index.info)
@@ -3241,22 +3245,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->gpl_syms),
&mod->num_gpl_syms);
mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(info,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(info, "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(info, "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
+
#ifdef CONFIG_CONSTRUCTORS
mod->ctors = section_objs(info, ".ctors",
sizeof(*mod->ctors), &mod->num_ctors);
@@ -3437,14 +3426,8 @@ static int check_module_license_and_versions(struct module *mod)
pr_warn("%s: module license taints kernel.\n", mod->name);
#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs)
- || (mod->num_gpl_syms && !mod->gpl_crcs)
- || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !mod->unused_crcs)
- || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
- ) {
+ if ((mod->num_syms && !mod->crcs) ||
+ (mod->num_gpl_syms && !mod->gpl_crcs)) {
return try_to_force_load(mod,
"no versions for exported symbols");
}
@@ -3894,26 +3877,50 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err = 0;
char *after_dashes;
- err = elf_header_check(info);
+ /*
+ * Do the signature check (if any) first. All that
+ * the signature check needs is info->len, it does
+ * not need any of the section info. That can be
+ * set up later. This will minimize the chances
+ * of a corrupt module causing problems before
+ * we even get to the signature check.
+ *
+ * The check will also adjust info->len by stripping
+ * off the sig length at the end of the module, making
+ * checks against info->len more correct.
+ */
+ err = module_sig_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /*
+ * Do basic sanity checks against the ELF header and
+ * sections.
+ */
+ err = elf_validity_check(info);
if (err) {
- pr_err("Module has invalid ELF header\n");
+ pr_err("Module has invalid ELF structures\n");
goto free_copy;
}
+ /*
+ * Everything checks out, so set up the section info
+ * in the info structure.
+ */
err = setup_load_info(info, flags);
if (err)
goto free_copy;
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
if (blacklisted(info->name)) {
err = -EPERM;
pr_err("Module %s is blacklisted\n", info->name);
goto free_copy;
}
- err = module_sig_check(info, flags);
- if (err)
- goto free_copy;
-
err = rewrite_section_headers(info, flags);
if (err)
goto free_copy;
@@ -4374,16 +4381,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
+#ifdef CONFIG_LIVEPATCH
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
struct module *mod;
unsigned int i;
- int ret;
-
- module_assert_mutex();
+ int ret = 0;
+ mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list) {
/* We hold module_mutex: no need for rcu_dereference_sched */
struct mod_kallsyms *kallsyms = mod->kallsyms;
@@ -4399,11 +4406,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
ret = fn(data, kallsyms_symbol_name(kallsyms, i),
mod, kallsyms_symbol_value(sym));
if (ret != 0)
- return ret;
+ break;
}
}
- return 0;
+ mutex_unlock(&module_mutex);
+ return ret;
}
+#endif /* CONFIG_LIVEPATCH */
#endif /* CONFIG_KALLSYMS */
/* Maximum number of characters written by module_flags() */
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
index 4224a1086b7d..00132d12487c 100644
--- a/kernel/module_signature.c
+++ b/kernel/module_signature.c
@@ -25,7 +25,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len,
return -EBADMSG;
if (ms->id_type != PKEY_ID_PKCS7) {
- pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ pr_err("%s: not signed with expected PKCS#7 message\n",
name);
return -ENOPKG;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 9d9fc678c91d..8723ae70ea1f 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -30,7 +30,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
- ret = mod_check_sig(&ms, modlen, info->name);
+ ret = mod_check_sig(&ms, modlen, "module");
if (ret)
return ret;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a7320f07689d..6bfe3ead10ad 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -139,7 +139,6 @@ config PM_SLEEP_SMP_NONZERO_CPU
config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
- default n
help
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
@@ -147,7 +146,6 @@ config PM_AUTOSLEEP
config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
- default n
help
Allow user space to create, activate and deactivate wakeup source
objects with the help of a sysfs-based interface.
@@ -293,7 +291,6 @@ config PM_GENERIC_DOMAINS
config WQ_POWER_EFFICIENT_DEFAULT
bool "Enable workqueue power-efficient mode by default"
depends on PM
- default n
help
Per-cpu workqueues are generally preferred because they show
better performance thanks to cache locality; unfortunately,
@@ -322,15 +319,14 @@ config CPU_PM
bool
config ENERGY_MODEL
- bool "Energy Model for CPUs"
+ bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
depends on SMP
depends on CPU_FREQ
- default n
help
Several subsystems (thermal and/or the task scheduler for example)
- can leverage information about the energy consumed by CPUs to make
- smarter decisions. This config option enables the framework from
- which subsystems can access the energy models.
+ can leverage information about the energy consumed by devices to
+ make smarter decisions. This config option enables the framework
+ from which subsystems can access the energy models.
The exact usage of the energy model is subsystem-dependent.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0aefd6f57e0a..12c7e1bb442f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -387,7 +387,7 @@ static struct attribute *suspend_attrs[] = {
NULL,
};
-static struct attribute_group suspend_attr_group = {
+static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
};
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 45b054b7b5ec..50cc63534486 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -235,7 +235,7 @@ void thaw_kernel_threads(void)
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ if (p->flags & PF_KTHREAD)
__thaw_task(p);
}
read_unlock(&tasklist_lock);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5a95c688621f..575a34b88936 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -735,9 +735,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
logbuf_lock_irq();
}
- if (user->seq < prb_first_valid_seq(prb)) {
+ if (r->info->seq != user->seq) {
/* our last seen message is gone, return error and reset */
- user->seq = prb_first_valid_seq(prb);
+ user->seq = r->info->seq;
ret = -EPIPE;
logbuf_unlock_irq();
goto out;
@@ -812,6 +812,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
+ struct printk_info info;
__poll_t ret = 0;
if (!user)
@@ -820,9 +821,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
logbuf_lock_irq();
- if (prb_read_valid(prb, user->seq, NULL)) {
+ if (prb_read_valid_info(prb, user->seq, &info, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < prb_first_valid_seq(prb))
+ if (info.seq != user->seq)
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
@@ -1559,6 +1560,7 @@ static void syslog_clear(void)
int do_syslog(int type, char __user *buf, int len, int source)
{
+ struct printk_info info;
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
@@ -1629,9 +1631,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
logbuf_lock_irq();
- if (syslog_seq < prb_first_valid_seq(prb)) {
+ if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
+ /* No unread messages. */
+ logbuf_unlock_irq();
+ return 0;
+ }
+ if (info.seq != syslog_seq) {
/* messages are gone, move to first one */
- syslog_seq = prb_first_valid_seq(prb);
+ syslog_seq = info.seq;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1643,7 +1650,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = prb_next_seq(prb) - syslog_seq;
} else {
bool time = syslog_partial ? syslog_time : printk_time;
- struct printk_info info;
unsigned int line_count;
u64 seq;
@@ -3429,9 +3435,11 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
goto out;
logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < prb_first_valid_seq(prb)) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = prb_first_valid_seq(prb);
+ if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) {
+ if (info.seq != dumper->cur_seq) {
+ /* messages are gone, move to first available one */
+ dumper->cur_seq = info.seq;
+ }
}
/* last entry */
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 5dc9d022db07..73cc80e01cef 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -287,7 +287,7 @@ _DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
/* Writer Interface */
/**
- * prb_rec_init_wd() - Initialize a buffer for writing records.
+ * prb_rec_init_wr() - Initialize a buffer for writing records.
*
* @r: The record to initialize.
* @text_buf_size: The needed text buffer size.
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index a0e6f746de6c..2e9e3ed7d63e 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -45,6 +45,8 @@ struct printk_safe_seq_buf {
static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
static DEFINE_PER_CPU(int, printk_context);
+static DEFINE_RAW_SPINLOCK(safe_read_lock);
+
#ifdef CONFIG_PRINTK_NMI
static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
#endif
@@ -180,8 +182,6 @@ static void report_message_lost(struct printk_safe_seq_buf *s)
*/
static void __printk_safe_flush(struct irq_work *work)
{
- static raw_spinlock_t read_lock =
- __RAW_SPIN_LOCK_INITIALIZER(read_lock);
struct printk_safe_seq_buf *s =
container_of(work, struct printk_safe_seq_buf, work);
unsigned long flags;
@@ -195,7 +195,7 @@ static void __printk_safe_flush(struct irq_work *work)
* different CPUs. This is especially important when printing
* a backtrace.
*/
- raw_spin_lock_irqsave(&read_lock, flags);
+ raw_spin_lock_irqsave(&safe_read_lock, flags);
i = 0;
more:
@@ -232,7 +232,7 @@ more:
out:
report_message_lost(s);
- raw_spin_unlock_irqrestore(&read_lock, flags);
+ raw_spin_unlock_irqrestore(&safe_read_lock, flags);
}
/**
@@ -278,6 +278,14 @@ void printk_safe_flush_on_panic(void)
raw_spin_lock_init(&logbuf_lock);
}
+ if (raw_spin_is_locked(&safe_read_lock)) {
+ if (num_online_cpus() > 1)
+ return;
+
+ debug_locks_off();
+ raw_spin_lock_init(&safe_read_lock);
+ }
+
printk_safe_flush();
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61db50f7ca86..821cf1723814 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
retval = -EPERM;
- if (unlikely(task->flags & PF_KTHREAD))
+ if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out;
if (same_thread_group(task, current))
goto out;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index cdc57b4f6d48..3128b7cf8e1f 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -95,6 +95,7 @@ config TASKS_RUDE_RCU
config TASKS_TRACE_RCU
def_bool 0
+ select IRQ_WORK
help
This option enables a task-based RCU implementation that uses
explicit rcu_read_lock_trace() read-side markers, and allows
@@ -188,8 +189,8 @@ config RCU_FAST_NO_HZ
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59ef1ae6dc37..bf0827d4b659 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -378,7 +378,11 @@ do { \
smp_mb__after_unlock_lock(); \
} while (0)
-#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+#define raw_spin_unlock_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irq_rcu_node(p) \
do { \
@@ -387,7 +391,10 @@ do { \
} while (0)
#define raw_spin_unlock_irq_rcu_node(p) \
- raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irqsave_rcu_node(p, flags) \
do { \
@@ -396,7 +403,10 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \
+} while (0)
#define raw_spin_trylock_rcu_node(p) \
({ \
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2d2a6b6b9dfb..7f181c9675f7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -7,10 +7,10 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/cpu.h>
#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include "rcu_segcblist.h"
@@ -88,23 +88,135 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
#endif
}
+/* Get the length of a segment of the rcu_segcblist structure. */
+static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+ long len = 0;
+ int i;
+
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ len += rcu_segcblist_get_seglen(rsclp, i);
+
+ return len;
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+ long len;
+
+ if (from == to)
+ return;
+
+ len = rcu_segcblist_get_seglen(rsclp, from);
+ if (!len)
+ return;
+
+ rcu_segcblist_add_seglen(rsclp, to, len);
+ rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
/*
* Increase the numeric length of an rcu_segcblist structure by the
* specified amount, which can be negative. This can cause the ->len
* field to disagree with the actual number of callbacks on the structure.
* This increase is fully ordered with respect to the callers accesses
* both before and after.
+ *
+ * So why on earth is a memory barrier required both before and after
+ * the update to the ->len field???
+ *
+ * The reason is that rcu_barrier() locklessly samples each CPU's ->len
+ * field, and if a given CPU's field is zero, avoids IPIing that CPU.
+ * This can of course race with both queuing and invoking of callbacks.
+ * Failing to correctly handle either of these races could result in
+ * rcu_barrier() failing to IPI a CPU that actually had callbacks queued
+ * which rcu_barrier() was obligated to wait on. And if rcu_barrier()
+ * failed to wait on such a callback, unloading certain kernel modules
+ * would result in calls to functions whose code was no longer present in
+ * the kernel, for but one example.
+ *
+ * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully
+ * ordered with respect with both list modifications and the rcu_barrier().
+ *
+ * The queuing case is CASE 1 and the invoking case is CASE 2.
+ *
+ * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes
+ * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field
+ * will transition from 0->1, which is one of the transitions that must
+ * be handled carefully. Without the full memory barriers after the ->len
+ * update and at the beginning of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * call_rcu().
+ * rcu_barrier() sees ->len as 0.
+ * set ->len = 1.
+ * rcu_barrier() does nothing.
+ * module is unloaded.
+ * callback invokes unloaded function!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0 will
+ * have unambiguously preceded the return from the racing call_rcu(), which
+ * means that this call_rcu() invocation is OK to not wait on. After all,
+ * you are supposed to make sure that any problematic call_rcu() invocations
+ * happen before the rcu_barrier().
+ *
+ *
+ * CASE 2: Suppose that CPU 0 is invoking its last callback just as
+ * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from
+ * 1->0, which is one of the transitions that must be handled carefully.
+ * Without the full memory barriers before the ->len update and at the
+ * end of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * start invoking last callback
+ * set ->len = 0 (reordered)
+ * rcu_barrier() sees ->len as 0
+ * rcu_barrier() does nothing.
+ * module is unloaded
+ * callback executing after unloaded!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0
+ * will be fully ordered after the completion of the callback function,
+ * so that the module unloading operation is completely safe.
+ *
*/
-static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
- smp_mb__before_atomic(); /* Up to the caller! */
+ smp_mb__before_atomic(); // Read header comment above.
atomic_long_add(v, &rsclp->len);
- smp_mb__after_atomic(); /* Up to the caller! */
+ smp_mb__after_atomic(); // Read header comment above.
#else
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
WRITE_ONCE(rsclp->len, rsclp->len + v);
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
#endif
}
@@ -120,26 +232,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
}
/*
- * Exchange the numeric length of the specified rcu_segcblist structure
- * with the specified value. This can cause the ->len field to disagree
- * with the actual number of callbacks on the structure. This exchange is
- * fully ordered with respect to the callers accesses both before and after.
- */
-static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- return atomic_long_xchg(&rsclp->len, v);
-#else
- long ret = rsclp->len;
-
- smp_mb(); /* Up to the caller! */
- WRITE_ONCE(rsclp->len, v);
- smp_mb(); /* Up to the caller! */
- return ret;
-#endif
-}
-
-/*
* Initialize an rcu_segcblist structure.
*/
void rcu_segcblist_init(struct rcu_segcblist *rsclp)
@@ -149,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
rsclp->head = NULL;
- for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
rsclp->tails[i] = &rsclp->head;
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
rcu_segcblist_set_len(rsclp, 0);
- rsclp->enabled = 1;
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
@@ -163,16 +257,21 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- rsclp->enabled = 0;
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
* Mark the specified rcu_segcblist structure as offloaded. This
* structure must be empty.
*/
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- rsclp->offloaded = 1;
+ if (offload) {
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
+ } else {
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
+ }
}
/*
@@ -245,7 +344,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
- smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
@@ -274,6 +373,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1])
break;
+ rcu_segcblist_inc_seglen(rsclp, i);
WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
WRITE_ONCE(rsclp->tails[i], &rhp->next);
@@ -281,21 +381,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
}
/*
- * Extract only the counts from the specified rcu_segcblist structure,
- * and place them in the specified rcu_cblist structure. This function
- * supports both callback orphaning and invocation, hence the separation
- * of counts and callbacks. (Callbacks ready for invocation must be
- * orphaned and adopted separately from pending callbacks, but counts
- * apply to all callbacks. Locking must be used to make sure that
- * both orphaned-callbacks lists are consistent.)
- */
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp)
-{
- rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
-}
-
-/*
* Extract only those callbacks ready to be invoked from the specified
* rcu_segcblist structure and place them in the specified rcu_cblist
* structure.
@@ -307,6 +392,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
@@ -314,6 +400,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+ rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}
/*
@@ -330,11 +417,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_pend_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = 0;
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
- for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+ rclp->len += rcu_segcblist_get_seglen(rsclp, i);
WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
}
/*
@@ -345,7 +436,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rcu_segcblist_add_len(rsclp, rclp->len);
- rclp->len = 0;
}
/*
@@ -359,6 +449,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
+ rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
@@ -379,6 +470,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
+
+ rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
}
@@ -403,6 +496,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
}
/* If no callbacks moved, nothing more need be done. */
@@ -423,6 +517,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, j);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -444,7 +539,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
*/
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
{
- int i;
+ int i, j;
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -487,6 +582,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
return false;
+ /* Accounting: everything below i is about to get merged into i. */
+ for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+ rcu_segcblist_move_seglen(rsclp, j, i);
+
/*
* Merge all later callbacks, including newly arrived callbacks,
* into the segment located by the for-loop above. Assign "seq"
@@ -514,13 +613,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_cblist donecbs;
struct rcu_cblist pendcbs;
+ lockdep_assert_cpus_held();
+
rcu_cblist_init(&donecbs);
rcu_cblist_init(&pendcbs);
- rcu_segcblist_extract_count(src_rsclp, &donecbs);
+
rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+ /*
+ * No need smp_mb() before setting length to 0, because CPU hotplug
+ * lock excludes rcu_barrier.
+ */
+ rcu_segcblist_set_len(src_rsclp, 0);
+
rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
rcu_segcblist_init(src_rsclp);
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 492262bcb591..9a19328ff251 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -50,19 +53,51 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ rsclp->flags |= flags;
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ rsclp->flags &= ~flags;
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ return READ_ONCE(rsclp->flags) & flags;
+}
+
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return rsclp->enabled;
+ return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded? */
+/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+ return true;
+
+ return false;
+}
+
+static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
+{
+ int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags)
+ return true;
+
+ return false;
}
/*
@@ -75,10 +110,22 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
@@ -88,8 +135,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 528ed10b78fd..99657ffa6688 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -85,6 +85,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false,
"Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -97,6 +98,8 @@ torture_param(int, object_debug, 0,
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
+torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
torture_param(int, read_exit_delay, 13,
"Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16,
@@ -127,10 +130,12 @@ static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
+static int nrealnocbers;
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
@@ -142,11 +147,22 @@ static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
+// Mailbox-like structure to check RCU global memory ordering.
+struct rcu_torture_reader_check {
+ unsigned long rtc_myloops;
+ int rtc_chkrdr;
+ unsigned long rtc_chkloops;
+ int rtc_ready;
+ struct rcu_torture_reader_check *rtc_assigner;
+} ____cacheline_internodealigned_in_smp;
+
+// Update-side data structure used to check RCU readers.
struct rcu_torture {
struct rcu_head rtort_rcu;
int rtort_pipe_count;
struct list_head rtort_free;
int rtort_mbtest;
+ struct rcu_torture_reader_check *rtort_chkp;
};
static LIST_HEAD(rcu_torture_freelist);
@@ -157,10 +173,13 @@ static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static struct rcu_torture_reader_check *rcu_torture_reader_mbchk;
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_mbchk_fail;
+static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
@@ -174,6 +193,8 @@ static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
static unsigned long start_gp_seq;
+static atomic_long_t n_nocb_offload;
+static atomic_long_t n_nocb_deoffload;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -183,9 +204,11 @@ static int rcu_torture_writer_state;
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
#define RTWS_COND_SYNC 6
-#define RTWS_SYNC 7
-#define RTWS_STUTTER 8
-#define RTWS_STOPPING 9
+#define RTWS_POLL_GET 7
+#define RTWS_POLL_WAIT 8
+#define RTWS_SYNC 9
+#define RTWS_STUTTER 10
+#define RTWS_STOPPING 11
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -194,6 +217,8 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
"RTWS_COND_SYNC",
+ "RTWS_POLL_GET",
+ "RTWS_POLL_WAIT",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -311,7 +336,9 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
- unsigned long (*get_state)(void);
+ unsigned long (*get_gp_state)(void);
+ unsigned long (*start_gp_poll)(void);
+ bool (*poll_gp_state)(unsigned long oldstate);
void (*cond_sync)(unsigned long oldstate);
call_rcu_func_t call;
void (*cb_barrier)(void);
@@ -386,7 +413,12 @@ static bool
rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
+ struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp);
+ if (rtrcp) {
+ WRITE_ONCE(rp->rtort_chkp, NULL);
+ smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
+ }
i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -461,7 +493,7 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
- .get_state = get_state_synchronize_rcu,
+ .get_gp_state = get_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
@@ -570,6 +602,21 @@ static void srcu_torture_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static unsigned long srcu_torture_get_gp_state(void)
+{
+ return get_state_synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_start_gp_poll(void)
+{
+ return start_poll_synchronize_srcu(srcu_ctlp);
+}
+
+static bool srcu_torture_poll_gp_state(unsigned long oldstate)
+{
+ return poll_state_synchronize_srcu(srcu_ctlp, oldstate);
+}
+
static void srcu_torture_call(struct rcu_head *head,
rcu_callback_t func)
{
@@ -601,6 +648,9 @@ static struct rcu_torture_ops srcu_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -1018,42 +1068,26 @@ rcu_torture_fqs(void *arg)
return 0;
}
+// Used by writers to randomly choose from the available grace-period
+// primitives. The only purpose of the initialization is to size the array.
+static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
+static int nsynctypes;
+
/*
- * RCU torture writer kthread. Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
+ * Determine which grace-period primitives are available.
*/
-static int
-rcu_torture_writer(void *arg)
+static void rcu_torture_write_types(void)
{
- bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
- int expediting = 0;
- unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_sync1 = gp_sync;
- int i;
- int oldnice = task_nice(current);
- struct rcu_torture *rp;
- struct rcu_torture *old_rp;
- static DEFINE_TORTURE_RANDOM(rand);
- bool stutter_waited;
- int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
- RTWS_COND_GET, RTWS_SYNC };
- int nsynctypes = 0;
-
- VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite)
- pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s.\n",
- torture_type, cur_ops->name);
+ bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
- } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
if (gp_exp1 && cur_ops->exp_sync) {
@@ -1068,12 +1102,46 @@ rcu_torture_writer(void *arg)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
+ if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ synctype[nsynctypes++] = RTWS_POLL_GET;
+ pr_info("%s: Testing polling GPs.\n", __func__);
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
+ pr_alert("%s: gp_poll without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ bool boot_ended;
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
+ unsigned long cookie;
+ int expediting = 0;
+ unsigned long gp_snap;
+ int i;
+ int idx;
+ int oldnice = task_nice(current);
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ bool stutter_waited;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " GP expediting controlled from boot/sysfs for %s.\n",
+ torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
"rcu_torture_writer: No update-side primitives.\n")) {
/*
@@ -1087,7 +1155,7 @@ rcu_torture_writer(void *arg)
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
- schedule_timeout_uninterruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
@@ -1107,6 +1175,18 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
+ idx = cur_ops->readlock();
+ cookie = cur_ops->get_gp_state();
+ WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
+ cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1119,15 +1199,21 @@ rcu_torture_writer(void *arg)
break;
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
- gp_snap = cur_ops->get_state();
- i = torture_random(&rand) % 16;
- if (i != 0)
- schedule_timeout_interruptible(i);
- udelay(torture_random(&rand) % 1000);
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET:
+ rcu_torture_writer_state = RTWS_POLL_GET;
+ gp_snap = cur_ops->start_gp_poll();
+ rcu_torture_writer_state = RTWS_POLL_WAIT;
+ while (!cur_ops->poll_gp_state(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
cur_ops->sync();
@@ -1137,6 +1223,14 @@ rcu_torture_writer(void *arg)
WARN_ON_ONCE(1);
break;
}
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
+ !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
}
WRITE_ONCE(rcu_torture_current_version,
rcu_torture_current_version + 1);
@@ -1155,12 +1249,13 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
+ boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
!READ_ONCE(rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- rcu_inkernel_boot_has_ended())
+ boot_ended)
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
@@ -1194,26 +1289,43 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
+ unsigned long gp_snap;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
- udelay(torture_random(&rand) & 0x3ff);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- } else if (gp_normal == gp_exp) {
- if (cur_ops->sync && torture_random(&rand) & 0x80)
- cur_ops->sync();
- else if (cur_ops->exp_sync)
+ } else {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ break;
+ case RTWS_EXP_SYNC:
cur_ops->exp_sync();
- } else if (gp_normal && cur_ops->sync) {
- cur_ops->sync();
- } else if (cur_ops->exp_sync) {
- cur_ops->exp_sync();
+ break;
+ case RTWS_COND_GET:
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync(gp_snap);
+ break;
+ case RTWS_POLL_GET:
+ gp_snap = cur_ops->start_gp_poll();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_SYNC:
+ cur_ops->sync();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
stutter_wait("rcu_torture_fakewriter");
} while (!torture_must_stop());
@@ -1227,6 +1339,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
kfree(rhp);
}
+// Set up and carry out testing of RCU's global memory ordering
+static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
+ struct torture_random_state *trsp)
+{
+ unsigned long loops;
+ int noc = torture_num_online_cpus();
+ int rdrchked;
+ int rdrchker;
+ struct rcu_torture_reader_check *rtrcp; // Me.
+ struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking.
+ struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked.
+ struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me.
+
+ if (myid < 0)
+ return; // Don't try this from timer handlers.
+
+ // Increment my counter.
+ rtrcp = &rcu_torture_reader_mbchk[myid];
+ WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1);
+
+ // Attempt to assign someone else some checking work.
+ rdrchked = torture_random(trsp) % nrealreaders;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ rdrchker = torture_random(trsp) % nrealreaders;
+ rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker];
+ if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker &&
+ smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below.
+ !READ_ONCE(rtp->rtort_chkp) &&
+ !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below.
+ rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0);
+ rtrcp->rtc_chkrdr = rdrchked;
+ WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends.
+ if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) ||
+ cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp))
+ (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out.
+ }
+
+ // If assigned some completed work, do it!
+ rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner);
+ if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready))
+ return; // No work or work not yet ready.
+ rdrchked = rtrcp_assigner->rtc_chkrdr;
+ if (WARN_ON_ONCE(rdrchked < 0))
+ return;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ loops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ atomic_inc(&n_rcu_torture_mbchk_tries);
+ if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops))
+ atomic_inc(&n_rcu_torture_mbchk_fail);
+ rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2;
+ rtrcp_assigner->rtc_ready = 0;
+ smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work.
+ smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
+}
+
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1362,8 +1530,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
* no data to read. Can be invoked both from process context and
* from a timer handler.
*/
-static bool rcu_torture_one_read(struct torture_random_state *trsp)
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ unsigned long cookie;
int i;
unsigned long started;
unsigned long completed;
@@ -1379,6 +1548,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
@@ -1394,6 +1565,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
+ rcu_torture_reader_do_mbchk(myid, p, trsp);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
@@ -1415,6 +1587,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
// This next splat is expected behavior if leakpointer, especially
@@ -1443,7 +1622,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
static void rcu_torture_timer(struct timer_list *unused)
{
atomic_long_inc(&n_rcu_torture_timers);
- (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand));
+ (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
/* Test call_rcu() invocation from interrupt handler. */
if (cur_ops->call) {
@@ -1479,13 +1658,13 @@ rcu_torture_reader(void *arg)
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand) && !torture_must_stop())
+ if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
lastsleep = jiffies + 10;
}
- while (num_online_cpus() < mynumonline && !torture_must_stop())
+ while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
@@ -1499,6 +1678,53 @@ rcu_torture_reader(void *arg)
}
/*
+ * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
+ * increase race probabilities and fuzzes the interval between toggling.
+ */
+static int rcu_nocb_toggle(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ int oldnice = task_nice(current);
+ long r;
+ DEFINE_TORTURE_RANDOM(rand);
+ ktime_t toggle_delay;
+ unsigned long toggle_fuzz;
+ ktime_t toggle_interval = ms_to_ktime(nocbs_toggle);
+
+ VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (toggle_interval > ULONG_MAX)
+ toggle_fuzz = ULONG_MAX >> 3;
+ else
+ toggle_fuzz = toggle_interval >> 3;
+ if (toggle_fuzz <= 0)
+ toggle_fuzz = NSEC_PER_USEC;
+ do {
+ r = torture_random(&rand);
+ cpu = (r >> 4) % (maxcpu + 1);
+ if (r & 0x1) {
+ rcu_nocb_cpu_offload(cpu);
+ atomic_long_inc(&n_nocb_offload);
+ } else {
+ rcu_nocb_cpu_deoffload(cpu);
+ atomic_long_inc(&n_nocb_deoffload);
+ }
+ toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL);
+ if (stutter_wait("rcu_nocb_toggle"))
+ sched_set_normal(current, oldnice);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_nocb_toggle");
+ return 0;
+}
+
+/*
* Print torture statistics. Caller must ensure that there is only
* one call to this function at a given time!!! This is normally
* accomplished by relying on the module system to only have one copy
@@ -1539,8 +1765,9 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror);
@@ -1553,16 +1780,20 @@ rcu_torture_stats_print(void)
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
- pr_cont("read-exits: %ld\n", data_race(n_read_exits));
+ pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
+ pr_cont("nocb-toggles: %ld:%ld\n",
+ atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
+ atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
@@ -1647,7 +1878,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
- "read_exit_delay=%d read_exit_burst=%d\n",
+ "read_exit_delay=%d read_exit_burst=%d "
+ "nocbs_nthreads=%d nocbs_toggle=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1657,7 +1889,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu_block,
n_barrier_cbs,
onoff_interval, onoff_holdoff,
- read_exit_delay, read_exit_burst);
+ read_exit_delay, read_exit_burst,
+ nocbs_nthreads, nocbs_toggle);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2392,7 +2625,7 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Minimize time between reading and exiting.
while (!kthread_should_stop())
schedule_timeout_uninterruptible(1);
- (void)rcu_torture_one_read(trsp);
+ (void)rcu_torture_one_read(trsp, -1);
return 0;
}
@@ -2500,6 +2733,13 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_stall, stall_task);
torture_stop_kthread(rcu_torture_writer, writer_task);
+ if (nocb_tasks) {
+ for (i = 0; i < nrealnocbers; i++)
+ torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]);
+ kfree(nocb_tasks);
+ nocb_tasks = NULL;
+ }
+
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -2507,6 +2747,8 @@ rcu_torture_cleanup(void)
kfree(reader_tasks);
reader_tasks = NULL;
}
+ kfree(rcu_torture_reader_mbchk);
+ rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
@@ -2604,6 +2846,7 @@ static void rcu_test_debug_objects(void)
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
@@ -2616,6 +2859,10 @@ static void rcu_test_debug_objects(void)
local_irq_disable(); /* Make it harder to start a new grace period. */
call_rcu(&rh2, rcu_torture_leak_cb);
call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ if (rhp) {
+ call_rcu(rhp, rcu_torture_leak_cb);
+ call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ }
local_irq_enable();
rcu_read_unlock();
preempt_enable();
@@ -2710,6 +2957,8 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_alloc_fail, 0);
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_mbchk_fail, 0);
+ atomic_set(&n_rcu_torture_mbchk_tries, 0);
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
@@ -2729,6 +2978,7 @@ rcu_torture_init(void)
/* Start up the kthreads. */
+ rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
if (firsterr)
@@ -2751,17 +3001,40 @@ rcu_torture_init(void)
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
- if (reader_tasks == NULL) {
+ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
+ GFP_KERNEL);
+ if (!reader_tasks || !rcu_torture_reader_mbchk) {
VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
+ rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
if (firsterr)
goto unwind;
}
+ nrealnocbers = nocbs_nthreads;
+ if (WARN_ON(nrealnocbers < 0))
+ nrealnocbers = 1;
+ if (WARN_ON(nocbs_toggle < 0))
+ nocbs_toggle = HZ;
+ if (nrealnocbers > 0) {
+ nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ if (nocb_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ } else {
+ nocb_tasks = NULL;
+ }
+ for (i = 0; i < nrealnocbers; i++) {
+ firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 23ff36a66f97..02dd9767b559 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -46,6 +46,18 @@
#define VERBOSE_SCALEOUT(s, x...) \
do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+static atomic_t verbose_batch_ctr;
+
+#define VERBOSE_SCALEOUT_BATCH(s, x...) \
+do { \
+ if (verbose && \
+ (verbose_batched <= 0 || \
+ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
+ schedule_timeout_uninterruptible(1); \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \
+ } \
+} while (0)
+
#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
@@ -57,6 +69,7 @@ module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
@@ -368,14 +381,14 @@ ref_scale_reader(void *arg)
u64 start;
s64 duration;
- VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
set_user_nice(current, MAX_NICE);
atomic_inc(&n_init);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
repeat:
- VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
// Wait for signal that this reader can start.
wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -392,7 +405,7 @@ repeat:
while (atomic_read_acquire(&n_started))
cpu_relax();
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx);
// To reduce noise, do an initial cache-warming invocation, check
@@ -421,8 +434,8 @@ repeat:
if (atomic_dec_and_test(&nreaders_exp))
wake_up(&main_wq);
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
- me, exp_idx, atomic_read(&nreaders_exp));
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
if (!torture_must_stop())
goto repeat;
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c9..26344dc6483b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_gp_running = false;
ssp->srcu_gp_waiting = false;
ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+ if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -124,11 +127,12 @@ void srcu_drive_gp(struct work_struct *wp)
ssp->srcu_cb_head = NULL;
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = ssp->srcu_idx;
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -146,11 +150,27 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (READ_ONCE(ssp->srcu_cb_head))
+ if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned short cookie;
+
+ cookie = get_state_synchronize_srcu(ssp);
+ if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ return;
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
+ }
+}
+
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
@@ -166,12 +186,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(ssp->srcu_gp_running)) {
- if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
- else if (list_empty(&ssp->srcu_work.entry))
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
- }
+ srcu_gp_start_if_needed(ssp);
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -190,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret & USHRT_MAX;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret = get_state_synchronize_srcu(ssp);
+
+ srcu_gp_start_if_needed(ssp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0f23d20d485a..e26547b34ad3 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -808,6 +808,46 @@ static void srcu_leak_callback(struct rcu_head *rhp)
}
/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+
+ check_init_srcu_struct(ssp);
+ idx = srcu_read_lock(ssp);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_read_unlock(ssp, idx);
+ return s;
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -838,14 +878,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
- unsigned long flags;
- int idx;
- bool needexp = false;
- bool needgp = false;
- unsigned long s;
- struct srcu_data *sdp;
-
- check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -853,28 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
return;
}
rhp->func = func;
- idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
- sdp->srcu_gp_seq_needed = s;
- needgp = true;
- }
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
- sdp->srcu_gp_seq_needed_exp = s;
- needexp = true;
- }
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- if (needgp)
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
- else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}
/**
@@ -1003,6 +1014,77 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
+ * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/*
* Callback function for srcu_barrier() use.
*/
@@ -1160,6 +1242,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
static void srcu_invoke_callbacks(struct work_struct *work)
{
+ long len;
bool more;
struct rcu_cblist ready_cbs;
struct rcu_head *rhp;
@@ -1182,6 +1265,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ len = ready_cbs.len;
spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
@@ -1190,13 +1274,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rhp->func(rhp);
local_bh_enable();
}
+ WARN_ON_ONCE(ready_cbs.len);
/*
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
spin_lock_irq_rcu_node(sdp);
- rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
+ rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&ssp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 36607551f966..af7c19439f4e 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1224,6 +1224,82 @@ void show_rcu_tasks_gp_kthreads(void)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifdef CONFIG_PROVE_RCU
+struct rcu_tasks_test_desc {
+ struct rcu_head rh;
+ const char *name;
+ bool notrun;
+};
+
+static struct rcu_tasks_test_desc tests[] = {
+ {
+ .name = "call_rcu_tasks()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_rude()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_trace()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ }
+};
+
+static void test_rcu_tasks_callback(struct rcu_head *rhp)
+{
+ struct rcu_tasks_test_desc *rttd =
+ container_of(rhp, struct rcu_tasks_test_desc, rh);
+
+ pr_info("Callback from %s invoked.\n", rttd->name);
+
+ rttd->notrun = true;
+}
+
+static void rcu_tasks_initiate_self_tests(void)
+{
+ pr_info("Running RCU-tasks wait API self tests\n");
+#ifdef CONFIG_TASKS_RCU
+ synchronize_rcu_tasks();
+ call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ synchronize_rcu_tasks_rude();
+ call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ synchronize_rcu_tasks_trace();
+ call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+#endif
+}
+
+static int rcu_tasks_verify_self_tests(void)
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ if (!tests[i].notrun) { // still hanging.
+ pr_err("%s has been failed.\n", tests[i].name);
+ ret = -1;
+ }
+ }
+
+ if (ret)
+ WARN_ON(1);
+
+ return ret;
+}
+late_initcall(rcu_tasks_verify_self_tests);
+#else /* #ifdef CONFIG_PROVE_RCU */
+static void rcu_tasks_initiate_self_tests(void) { }
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
@@ -1237,6 +1313,9 @@ void __init rcu_init_tasks_generic(void)
#ifdef CONFIG_TASKS_TRACE_RCU
rcu_spawn_tasks_trace_kthread();
#endif
+
+ // Run the self-tests.
+ rcu_tasks_initiate_self_tests();
}
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40e5e3dd253e..da6f5213fb74 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
@@ -100,8 +103,10 @@ static struct rcu_state rcu_state = {
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -644,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user)
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
@@ -678,6 +682,50 @@ void rcu_idle_enter(void)
EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
+
+#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+/*
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
+ */
+static void late_wakeup_func(struct irq_work *work)
+{
+}
+
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func);
+
+/*
+ * If either:
+ *
+ * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+ * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
+ *
+ * In these cases the late RCU wake ups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled again.
+ */
+noinstr static void rcu_irq_work_resched(void)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+ return;
+
+ if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return;
+
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+ }
+ instrumentation_end();
+}
+
+#else
+static inline void rcu_irq_work_resched(void) { }
+#endif
+
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -692,8 +740,16 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
noinstr void rcu_user_enter(void)
{
lockdep_assert_irqs_disabled();
+
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
rcu_eqs_enter(true);
}
+
#endif /* CONFIG_NO_HZ_FULL */
/**
@@ -1495,6 +1551,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1515,6 +1573,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
@@ -1765,7 +1825,7 @@ static bool rcu_gp_init(void)
* go offline later. Please also refer to "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- rcu_state.gp_state = RCU_GP_ONOFF;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
rcu_for_each_leaf_node(rnp) {
smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
firstseq = READ_ONCE(rnp->ofl_seq);
@@ -1831,7 +1891,7 @@ static bool rcu_gp_init(void)
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_state.gp_state = RCU_GP_INIT;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
rcu_for_each_node_breadth_first(rnp) {
rcu_gp_slow(gp_init_delay);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1930,17 +1990,22 @@ static void rcu_gp_fqs_loop(void)
ret = 0;
for (;;) {
if (!ret) {
- rcu_state.jiffies_force_qs = jiffies + j;
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * jiffies_force_qs before RCU_GP_WAIT_FQS state
+ * update; required for stall checks.
+ */
+ smp_wmb();
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
- rcu_state.gp_state = RCU_GP_WAIT_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
ret = swait_event_idle_timeout_exclusive(
rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DOING_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2054,7 +2119,7 @@ static void rcu_gp_cleanup(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- rcu_state.gp_state = RCU_GP_IDLE;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@ -2093,12 +2158,12 @@ static int __noreturn rcu_gp_kthread(void *unused)
for (;;) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rcu_state.gp_state = RCU_GP_WAIT_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DONE_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
break;
@@ -2113,9 +2178,9 @@ static int __noreturn rcu_gp_kthread(void *unused)
rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_state.gp_state = RCU_GP_CLEANUP;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
rcu_gp_cleanup();
- rcu_state.gp_state = RCU_GP_CLEANED;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
@@ -2430,11 +2495,12 @@ int rcutree_dead_cpu(unsigned int cpu)
static void rcu_do_batch(struct rcu_data *rdp)
{
int div;
+ bool __maybe_unused empty;
unsigned long flags;
const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count;
+ long bl, count = 0;
long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
@@ -2471,14 +2537,18 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
+ count++;
debug_rcu_head_unqueue(rhp);
rcu_lock_acquire(&rcu_callback_map);
@@ -2492,21 +2562,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
- * Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl && !offloaded &&
+ if (count >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
if (unlikely(tlimit)) {
/* only call local_clock() every 32 callbacks */
- if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ if (likely((count & 31) || local_clock() < tlimit))
continue;
/* Exceeded the time limit, so leave. */
break;
}
- if (offloaded) {
- WARN_ON_ONCE(in_serving_softirq());
+ if (!in_serving_softirq()) {
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -2517,15 +2585,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
- count = -rcl.len;
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rcu_segcblist_insert_count(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
@@ -2543,9 +2609,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- count != 0 && rcu_segcblist_empty(&rdp->cblist));
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
rcu_nocb_unlock_irqrestore(rdp, flags);
@@ -2566,6 +2635,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
@@ -2579,6 +2649,7 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2688,7 +2759,7 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2708,17 +2779,17 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
- local_irq_save(flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
@@ -2941,6 +3012,7 @@ static void check_cb_ovld(struct rcu_data *rdp)
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
struct rcu_data *rdp;
bool was_alldone;
@@ -2954,8 +3026,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* Use rcu:rcu_callback trace event to find the previous
* time callback was passed to __call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
- head, head->func);
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
@@ -2989,6 +3063,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+
/* Go handle any RCU core processing required. */
if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
@@ -3498,6 +3574,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}
+ kasan_record_aux_stack(ptr);
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
run_page_cache_worker(krcp);
@@ -3747,6 +3824,8 @@ static int rcu_pending(int user)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
@@ -4001,12 +4080,18 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
- if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !rcu_segcblist_is_offloaded(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ /*
+ * Lock in case the CB/GP kthreads are still around handling
+ * old callbacks (longer term we should flush all callbacks
+ * before completing CPU offline)
+ */
+ rcu_nocb_lock(rdp);
+ if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
+ rcu_nocb_unlock(rdp);
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -4159,6 +4244,9 @@ void rcu_report_dead(unsigned int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
/* QS for any half-done expedited grace period. */
preempt_disable();
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..71821d59d95c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -201,6 +201,7 @@ struct rcu_data {
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
atomic_t nocb_lock_contended; /* Contention experienced. */
@@ -256,6 +257,7 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE 1
#define RCU_NOCB_WAKE_FORCE 2
@@ -433,7 +435,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8760b6ead770..6c6ff06d4ae6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -545,7 +545,7 @@ static void synchronize_rcu_expedited_wait(void)
data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
if (ndetected) {
- pr_err("blocking rcu_node structures:");
+ pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..2d603771c7dc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
+ lockdep_assert_irqs_disabled();
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
@@ -1631,8 +1632,8 @@ bool rcu_is_nocb_cpu(int cpu)
* Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
bool needwake = false;
@@ -1643,7 +1644,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
del_timer(&rdp->nocb_timer);
rcu_nocb_unlock_irqrestore(rdp, flags);
@@ -1656,6 +1657,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return needwake;
}
/*
@@ -1665,6 +1668,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
const char *reason)
{
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
+ return;
if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
mod_timer(&rdp->nocb_timer, jiffies + 1);
if (rdp->nocb_defer_wakeup < waketype)
@@ -1929,6 +1934,52 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
}
/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ * rdp_offload_toggle() nocb_gp_enabled_cb()
+ * ------------------------- ----------------------------
+ * WRITE flags LOCK nocb_gp_lock
+ * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
+ * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
+ * UNLOCK nocb_gp_lock READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return true;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return false;
+}
+
+
+/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
* or for grace periods to end.
*/
@@ -1956,8 +2007,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
*/
WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
+ if (!nocb_gp_update_state(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -1967,6 +2028,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
@@ -2018,6 +2081,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (needwake_gp)
rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
@@ -2081,14 +2146,27 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
/*
* Invoke any ready callbacks from the corresponding no-CBs CPU,
* then, if there are no more, wait for more to appear.
*/
static void nocb_cb_wait(struct rcu_data *rdp)
{
+ struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
+ bool needwake_state = false;
bool needwake_gp = false;
struct rcu_node *rnp = rdp->mynode;
@@ -2100,32 +2178,55 @@ static void nocb_cb_wait(struct rcu_data *rdp)
local_bh_enable();
lockdep_assert_irqs_enabled();
rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- return;
- }
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
WRITE_ONCE(rdp->nocb_cb_sleep, true);
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- !READ_ONCE(rdp->nocb_cb_sleep));
- if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
- /* ^^^ Ensure CB invocation follows _sleep test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
}
/*
@@ -2148,24 +2249,27 @@ static int rcu_nocb_cb_kthread(void *arg)
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return READ_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
unsigned long flags;
int ndw;
+ int ret;
rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
}
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@ -2181,12 +2285,208 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
* This means we do an inexact common-case check. Note that if
* we miss, ->nocb_timer will eventually clean things up.
*/
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
+ return do_nocb_deferred_wakeup_common(rdp);
+ return false;
}
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * If there are still pending work offloaded, the offline
+ * CPU won't help much handling them.
+ */
+ if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return -EBUSY;
+ }
+
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /* Make sure nocb timer won't stay around */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ del_timer_sync(&rdp->nocb_timer);
+
+ /*
+ * Flush bypass. While IRQs are disabled and once we set
+ * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
+ * enqueued on bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_nocb_flush_bypass(rdp, NULL, jiffies);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
+ * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
+ * disabled now, but let's be paranoid.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_deoffload(rdp);
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ if (rdp == rdp->nocb_gp_rdp) {
+ pr_info("Can't deoffload an rdp GP leader (yet)\n");
+ return -EINVAL;
+ }
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ else
+ ret = __rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ /* Re-enable nocb timer */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_offload(rdp);
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ else
+ ret = __rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2229,7 +2529,9 @@ void __init rcu_init_nohz(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
}
rcu_organize_nocb_kthreads();
}
@@ -2239,6 +2541,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
init_swait_queue_head(&rdp->nocb_cb_wq);
init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
raw_spin_lock_init(&rdp->nocb_lock);
raw_spin_lock_init(&rdp->nocb_bypass_lock);
raw_spin_lock_init(&rdp->nocb_gp_lock);
@@ -2381,6 +2684,19 @@ void rcu_bind_current_to_nocb(void)
}
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
/*
* Dump out nocb grace-period kthread state for the specified rcu_data
* structure.
@@ -2389,7 +2705,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
struct rcu_node *rnp = rdp->mynode;
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
rdp->cpu,
"kK"[!!rdp->nocb_gp_kthread],
"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
@@ -2403,12 +2719,17 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
".B"[!!rdp->nocb_gp_bypass],
".G"[!!rdp->nocb_gp_gp],
(long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
}
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
+ char bufw[20];
+ char bufr[20];
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wastimer;
@@ -2417,8 +2738,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
- pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
@@ -2429,11 +2753,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
- ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
- ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
/* It is OK for GP kthreads to have GP state. */
if (rdp->nocb_gp_rdp == rdp)
@@ -2518,8 +2847,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
return false;
}
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ return false;
}
static void rcu_spawn_cpu_nocb_kthread(int cpu)
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 70d48c52fabc..475b26171b20 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -266,6 +266,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *t;
struct task_struct *ts[8];
+ lockdep_assert_irqs_disabled();
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -290,6 +291,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
put_task_struct(t);
ndetected++;
}
@@ -333,9 +335,12 @@ static void rcu_dump_cpu_stacks(void)
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ if (cpu_is_offline(cpu))
+ pr_err("Offline CPU %d blocking current GP.\n", cpu);
+ else if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -449,25 +454,66 @@ static void print_cpu_stall_info(int cpu)
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
+ int cpu;
struct task_struct *gpk = rcu_state.gp_kthread;
unsigned long j;
if (rcu_is_gp_kthread_starving(&j)) {
+ cpu = gpk ? task_cpu(gpk) : -1;
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ gpk ? gpk->state : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(gpk);
+ if (cpu >= 0) {
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ }
+ }
wake_up_process(gpk);
}
}
}
+/* Complain about missing wakeups from expired fqs wait timer */
+static void rcu_check_gp_kthread_expired_fqs_timer(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ short gp_state;
+ unsigned long jiffies_fqs;
+ int cpu;
+
+ /*
+ * Order reads of .gp_state and .jiffies_force_qs.
+ * Matching smp_wmb() is present in rcu_gp_fqs_loop().
+ */
+ gp_state = smp_load_acquire(&rcu_state.gp_state);
+ jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs);
+
+ if (gp_state == RCU_GP_WAIT_FQS &&
+ time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
+ gpk && !READ_ONCE(gpk->on_rq)) {
+ cpu = task_cpu(gpk);
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
+ rcu_state.name, (jiffies - jiffies_fqs),
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ data_race(rcu_state.gp_flags),
+ gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
+ gpk->state);
+ pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
+ cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
+ }
+}
+
static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
@@ -478,6 +524,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
struct rcu_node *rnp;
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -499,6 +547,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
}
}
ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
}
for_each_possible_cpu(cpu)
@@ -529,6 +578,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
WRITE_ONCE(rcu_state.jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
panic_on_rcu_stall();
@@ -544,6 +594,8 @@ static void print_cpu_stall(unsigned long gps)
struct rcu_node *rnp = rcu_get_root();
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -564,6 +616,7 @@ static void print_cpu_stall(unsigned long gps)
jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
rcu_dump_cpu_stacks();
@@ -598,6 +651,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
+ lockdep_assert_irqs_disabled();
if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
!rcu_gp_in_progress())
return;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 39334d2d2b37..b95ae86c40a7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -56,8 +56,10 @@
#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(rcu_normal_after_boot, int, 0);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/reboot.c b/kernel/reboot.c
index eb1b15850761..a6ad5eb2fa73 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
diff --git a/kernel/resource.c b/kernel/resource.c
index 833394f9c608..627e61b0c124 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -18,12 +18,15 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
#include <asm/io.h>
@@ -1119,6 +1122,55 @@ resource_size_t resource_alignment(struct resource *res)
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+ /* pairs with smp_store_release() in iomem_init_inode() */
+ struct inode *inode = smp_load_acquire(&iomem_inode);
+
+ /*
+ * Check that the initialization has completed. Losing the race
+ * is ok because it means drivers are claiming resources before
+ * the fs_initcall level of init and prevent iomem_get_mapping users
+ * from establishing mappings.
+ */
+ if (!inode)
+ return;
+
+ /*
+ * The expectation is that the driver has successfully marked
+ * the resource busy by this point, so devmem_is_allowed()
+ * should start returning false, however for performance this
+ * does not iterate the entire resource range.
+ */
+ if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+ devmem_is_allowed(PHYS_PFN(res->end))) {
+ /*
+ * *cringe* iomem=relaxed says "go ahead, what's the
+ * worst that can happen?"
+ */
+ return;
+ }
+
+ unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+ /*
+ * This function is only called from file open paths, hence guaranteed
+ * that fs_initcalls have completed and no need to check for NULL. But
+ * since revoke_iomem can be called before the initcall we still need
+ * the barrier to appease checkers.
+ */
+ return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
/**
* __request_region - create a new busy resource region
* @parent: parent resource descriptor
@@ -1186,7 +1238,7 @@ struct resource * __request_region(struct resource *parent,
write_unlock(&resource_lock);
if (res && orig_parent == &iomem_resource)
- revoke_devmem(res);
+ revoke_iomem(res);
return res;
}
@@ -1786,4 +1838,48 @@ static int __init strict_iomem(char *str)
return 1;
}
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+ .name = "iomem",
+ .owner = THIS_MODULE,
+ .init_fs_context = iomem_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+ static struct vfsmount *iomem_vfs_mount;
+ static int iomem_fs_cnt;
+ struct inode *inode;
+ int rc;
+
+ rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+ if (rc < 0) {
+ pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+ return rc;
+ }
+
+ inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ if (IS_ERR(inode)) {
+ rc = PTR_ERR(inode);
+ pr_err("Cannot allocate inode for iomem: %d\n", rc);
+ simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+ return rc;
+ }
+
+ /*
+ * Publish iomem revocation inode initialized.
+ * Pairs with smp_load_acquire() in revoke_iomem().
+ */
+ smp_store_release(&iomem_inode, inode);
+
+ return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
__setup("iomem=", strict_iomem);
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index d55a9f8cda3d..2377cbb32474 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -398,6 +398,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
static int scftorture_invoker(void *arg)
{
int cpu;
+ int curcpu;
DEFINE_TORTURE_RANDOM(rand);
struct scf_statistics *scfp = (struct scf_statistics *)arg;
bool was_offline = false;
@@ -412,7 +413,10 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(smp_processor_id() != scfp->cpu);
+ curcpu = smp_processor_id();
+ WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
+ "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
+ __func__, scfp->cpu, curcpu, nr_cpu_ids);
if (!atomic_dec_return(&n_started))
while (atomic_read_acquire(&n_started)) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ff74fca39ed2..98191218d891 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -380,7 +381,6 @@ static void __hrtick_start(void *arg)
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -1864,8 +1862,13 @@ struct migration_arg {
struct set_affinity_pending *pending;
};
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
+ unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
@@ -1900,8 +1903,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
- struct set_affinity_pending *pending;
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
@@ -1923,7 +1926,6 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
- pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1934,21 +1936,14 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
- p->migration_pending = NULL;
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
}
- /* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
- /*
- * When this was migrate_enable() but we no longer
- * have a @pending, a concurrent SCA 'fixed' things
- * and we should be valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
- }
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
@@ -1958,7 +1953,14 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;
- } else if (dest_cpu < 0 || pending) {
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1973,43 +1975,32 @@ static int migration_cpu_stop(void *data)
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- p->migration_pending = NULL;
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
goto out;
}
/*
- * When this was migrate_enable() but we no longer have an
- * @pending, a concurrent SCA 'fixed' things and we should be
- * valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
- goto out;
- }
-
- /*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
+ WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
+ if (pending)
+ pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
- /* For pending->{arg,stop_work} */
- pending = arg->pending;
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}
@@ -2196,11 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
- struct migration_arg arg = {
- .task = p,
- .dest_cpu = dest_cpu,
- };
- bool complete = false;
+ bool stop_pending, complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
@@ -2212,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);
if (push_task) {
@@ -2226,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
if (complete)
- goto do_complete;
+ complete_all(&pending->done);
return 0;
}
@@ -2237,6 +2228,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1, /* any */
+ .pending = &my_pending,
+ };
+
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
@@ -2261,45 +2258,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (flags & SCA_MIGRATE_ENABLE) {
-
- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
- p->migration_flags &= ~MDF_PUSH;
- task_rq_unlock(rq, p, rf);
-
- pending->arg = (struct migration_arg) {
- .task = p,
- .dest_cpu = -1,
- .pending = pending,
- };
-
- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
- &pending->arg, &pending->stop_work);
-
- return 0;
- }
-
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
- * Lessen races (and headaches) by delegating
- * is_migration_disabled(p) checks to the stopper, which will
- * run on the same CPU as said p.
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
*/
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);
-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2307,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */
/*
* Block the original owner of &pending until all subsequent callers
@@ -2315,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}
@@ -3478,7 +3474,7 @@ out:
/**
* try_invoke_on_locked_down_task - Invoke a function on task in fixed state
- * @p: Process for which the function is to be invoked.
+ * @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
@@ -3496,12 +3492,11 @@ out:
*/
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
{
- bool ret = false;
struct rq_flags rf;
+ bool ret = false;
struct rq *rq;
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (p->on_rq) {
rq = __task_rq_lock(p, &rf);
if (task_rq(p) == rq)
@@ -3518,7 +3513,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t
ret = func(p, arg);
}
}
- raw_spin_unlock_irq(&p->pi_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
@@ -4971,7 +4966,7 @@ static void __sched notrace __schedule(bool preempt)
schedule_debug(prev, preempt);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
@@ -5265,6 +5260,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#endif
+
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
@@ -5317,8 +5318,197 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#endif
+
#endif /* CONFIG_PREEMPTION */
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#include <linux/entry-common.h>
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_none = 0,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+static int preempt_dynamic_mode = preempt_dynamic_full;
+
+static int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return 0;
+
+ if (!strcmp(str, "voluntary"))
+ return 1;
+
+ if (!strcmp(str, "full"))
+ return 2;
+
+ return -1;
+}
+
+static void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 1;
+ }
+
+ sched_dynamic_update(mode);
+ return 0;
+}
+__setup("preempt=", setup_preempt_mode);
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+ static const char * preempt_modes[] = {
+ "none", "voluntary", "full"
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int sched_init_debug_dynamic(void)
+{
+ debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ return 0;
+}
+late_initcall(sched_init_debug_dynamic);
+
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+
/*
* This is the entry point to schedule() from kernel preemption
* off of irq context.
@@ -5616,8 +5806,12 @@ SYSCALL_DEFINE1(nice, int, increment)
* @p: the task in question.
*
* Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ * deadline -101 -1 0
*/
int task_prio(const struct task_struct *p)
{
@@ -5676,6 +5870,120 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p)
+{
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!uclamp_is_used() &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+ return max;
+ }
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_rq_util_with(rq, util, p);
+
+ dl_util = cpu_util_dl(rq);
+
+ /*
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if (util + dl_util >= max)
+ return max;
+
+ /*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ return min(max, util);
+}
+
+unsigned long sched_cpu_util(int cpu, unsigned long max)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ ENERGY_UTIL, NULL);
+}
+#endif /* CONFIG_SMP */
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
@@ -5797,11 +6105,10 @@ recheck:
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ if (attr->sched_priority > MAX_RT_PRIO-1)
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
@@ -6668,17 +6975,27 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPTION
-int __sched _cond_resched(void)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
+#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
+#endif
return 0;
}
-EXPORT_SYMBOL(_cond_resched);
+EXPORT_SYMBOL(__cond_resched);
+#endif
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
#endif
/*
@@ -6709,6 +7026,46 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_read(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_write(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
/**
* yield - yield the current processor to other threads.
*
@@ -6869,7 +7226,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
+ ret = MAX_RT_PRIO-1;
break;
case SCHED_DEADLINE:
case SCHED_NORMAL:
@@ -7509,6 +7866,12 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq_flags rf;
int ret;
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
/*
@@ -7653,7 +8016,6 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6931f0cdeb80..50cbad89f7fa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -26,7 +26,7 @@ struct sugov_policy {
struct sugov_tunables *tunables;
struct list_head tunables_hook;
- raw_spinlock_t update_lock; /* For shared policies */
+ raw_spinlock_t update_lock;
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
@@ -171,112 +171,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
-{
- unsigned long dl_util, util, irq;
- struct rq *rq = cpu_rq(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
- */
- util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
-
- /*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
- */
- if (util + dl_util >= max)
- return max;
-
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, max);
- util += irq;
-
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
-}
-
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
@@ -284,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
FREQUENCY_UTIL, NULL);
}
@@ -426,23 +320,21 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
-static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->limits_changed = true;
+ sg_cpu->sg_policy->limits_changed = true;
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
u64 time, unsigned int flags)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
- if (!sugov_should_update_freq(sg_policy, time))
+ if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
sugov_get_util(sg_cpu);
@@ -557,7 +449,7 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 75686c6d4436..aac3539aa0fe 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
update_dl_migration(dl_rq);
}
+#define __node_2_pdl(node) \
+ rb_entry((node), struct task_struct, pushable_dl_tasks)
+
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
+{
+ return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
+}
+
/*
* The list of pushable -deadline task is not a plist, like in
* sched_rt.c, it is an rb-tree with tasks ordered by deadline.
*/
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
- struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct task_struct *entry;
- bool leftmost = true;
+ struct rb_node *leftmost;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct task_struct,
- pushable_dl_tasks);
- if (dl_entity_preempt(&p->dl, &entry->dl))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
+ leftmost = rb_add_cached(&p->pushable_dl_tasks,
+ &rq->dl.pushable_dl_tasks_root,
+ __pushable_less);
if (leftmost)
- dl_rq->earliest_dl.next = p->dl.deadline;
-
- rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color_cached(&p->pushable_dl_tasks,
- &dl_rq->pushable_dl_tasks_root, leftmost);
+ rq->dl.earliest_dl.next = p->dl.deadline;
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
+ struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+ struct rb_node *leftmost;
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
- struct rb_node *next_node;
-
- next_node = rb_next(&p->pushable_dl_tasks);
- if (next_node) {
- dl_rq->earliest_dl.next = rb_entry(next_node,
- struct task_struct, pushable_dl_tasks)->dl.deadline;
- }
- }
+ leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+ if (leftmost)
+ dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
- rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
@@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
dec_dl_migration(dl_se, dl_rq);
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+ return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
+}
+
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_dl_entity *entry;
- int leftmost = 1;
BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_dl_entity, rb_node);
- if (dl_time_before(dl_se->deadline, entry->deadline))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
-
- rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+ rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
return;
rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1853,7 +1832,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_dl(rq))
start_hrtick_dl(rq, p);
if (rq->curr->sched_class != &dl_sched_class)
@@ -1916,7 +1895,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* not being the leftmost task anymore. In that case NEED_RESCHED will
* be set and schedule() will start a new hrtick for the next task.
*/
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
}
@@ -2409,9 +2388,13 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
- rq = task_rq_lock(p, &rf);
- if (!dl_task(p))
- goto unlock;
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (!dl_task(p)) {
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return;
+ }
+
+ rq = __task_rq_lock(p, &rf);
dl_b = &rq->rd->dl_bw;
raw_spin_lock(&dl_b->lock);
@@ -2420,7 +2403,6 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock(&dl_b->lock);
-unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -2514,7 +2496,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || rq->curr == p) {
+ if (task_on_rq_queued(p) || task_current(rq, p)) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2357921580f9..486f403a778b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -486,7 +486,7 @@ static char *task_group_path(struct task_group *tg)
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
- if (rq->curr == p)
+ if (task_current(rq, p))
SEQ_printf(m, ">R");
else
SEQ_printf(m, " %c", task_state_to_char(p));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04a3ce20da67..794c2cb945f8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline int entity_before(struct sched_entity *a,
+static inline bool entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
+#define __node_2_se(node) \
+ rb_entry((node), struct sched_entity, run_node)
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
if (leftmost) { /* non-empty tree */
- struct sched_entity *se;
- se = rb_entry(leftmost, struct sched_entity, run_node);
+ struct sched_entity *se = __node_2_se(leftmost);
if (!curr)
vruntime = se->vruntime;
@@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
#endif
}
+static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+{
+ return entity_before(__node_2_se(a), __node_2_se(b));
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- bool leftmost = true;
-
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color_cached(&se->run_node,
- &cfs_rq->tasks_timeline, leftmost);
+ rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
if (!left)
return NULL;
- return rb_entry(left, struct sched_entity, run_node);
+ return __node_2_se(left);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
@@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
if (!next)
return NULL;
- return rb_entry(next, struct sched_entity, run_node);
+ return __node_2_se(next);
}
#ifdef CONFIG_SCHED_DEBUG
@@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
if (!last)
return NULL;
- return rb_entry(last, struct sched_entity, run_node);
+ return __node_2_se(last);
}
/**************************************************************
@@ -3943,6 +3925,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
@@ -3956,23 +3954,16 @@ static inline bool within_margin(int value, int margin)
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
- int cpu;
if (!sched_feat(UTIL_EST))
return;
- /* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
-
- trace_sched_util_est_cfs_tp(cfs_rq);
-
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4012,8 +4003,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;
/*
@@ -4052,7 +4042,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return;
- if (!p) {
+ if (!p || p->nr_cpus_allowed == 1) {
rq->misfit_task_load = 0;
return;
}
@@ -4096,8 +4086,11 @@ static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -5133,7 +5126,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
@@ -5419,7 +5412,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
s64 delta = slice - ran;
if (delta < 0) {
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
return;
}
@@ -5436,7 +5429,7 @@ static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -5609,6 +5602,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ util_est_dequeue(&rq->cfs, p);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -5659,7 +5654,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;
dequeue_throttle:
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -6006,6 +6001,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
+static inline int __select_idle_cpu(int cpu)
+{
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+
+ return -1;
+}
+
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -6064,74 +6067,51 @@ unlock:
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu;
+ bool idle = true;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
- return -1;
+ return __select_idle_cpu(core);
- if (!test_idle_cores(target, false))
- return -1;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
- for_each_cpu_wrap(core, cpus, target) {
- bool idle = true;
-
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (!available_idle_cpu(cpu)) {
- idle = false;
- break;
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (!available_idle_cpu(cpu)) {
+ idle = false;
+ if (*idle_cpu == -1) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ *idle_cpu = cpu;
+ break;
+ }
+ continue;
}
+ break;
}
-
- if (idle)
- return core;
-
- cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+ *idle_cpu = cpu;
}
- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (idle)
+ return core;
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
}
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
- int cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
- !cpumask_test_cpu(cpu, sched_domain_span(sd)))
- continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- return cpu;
- }
+#else /* CONFIG_SCHED_SMT */
- return -1;
+static inline void set_idle_cores(int cpu, int val)
+{
}
-#else /* CONFIG_SCHED_SMT */
-
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static inline bool test_idle_cores(int cpu, bool def)
{
- return -1;
+ return def;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- return -1;
+ return __select_idle_cpu(core);
}
#endif /* CONFIG_SCHED_SMT */
@@ -6144,49 +6124,61 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ bool smt = test_idle_cores(target, false);
+ int this = smp_processor_id();
struct sched_domain *this_sd;
- u64 avg_cost, avg_idle;
u64 time;
- int this = smp_processor_id();
- int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
return -1;
- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
- return -1;
+ if (sched_feat(SIS_PROP) && !smt) {
+ u64 avg_cost, avg_idle, span_avg;
+
+ /*
+ * Due to large variance we need a large fuzz factor;
+ * hackbench in particularly is sensitive here.
+ */
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
+ span_avg = sd->span_weight * avg_idle;
if (span_avg > 4*avg_cost)
nr = div_u64(span_avg, avg_cost);
else
nr = 4;
- }
-
- time = cpu_clock(this);
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ time = cpu_clock(this);
+ }
for_each_cpu_wrap(cpu, cpus, target) {
- if (!--nr)
- return -1;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- break;
+ if (smt) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+
+ } else {
+ if (!--nr)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ break;
+ }
}
- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
+ if (smt)
+ set_idle_cores(this, false);
- return cpu;
+ if (sched_feat(SIS_PROP) && !smt) {
+ time = cpu_clock(this) - time;
+ update_avg(&this_sd->avg_scan_cost, time);
+ }
+
+ return idle_cpu;
}
/*
@@ -6315,18 +6307,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
i = select_idle_cpu(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
return target;
}
@@ -6543,7 +6527,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
ENERGY_UTIL, NULL);
/*
@@ -6553,7 +6537,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
FREQUENCY_UTIL, tsk);
max_util = max(max_util, cpu_util);
}
@@ -6651,7 +6635,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
- * aligned with schedutil_cpu_util().
+ * aligned with sched_cpu_util().
*/
util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
@@ -7132,7 +7116,7 @@ done: __maybe_unused;
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
@@ -9389,8 +9373,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- capacity = capacity_of(i);
nr_running = rq->cfs.h_nr_running;
+ if (!nr_running)
+ continue;
+
+ capacity = capacity_of(i);
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -9496,13 +9483,32 @@ asym_active_balance(struct lb_env *env)
}
static inline bool
-voluntary_active_balance(struct lb_env *env)
+imbalanced_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ /*
+ * The imbalanced case includes the case of pinned tasks preventing a fair
+ * distribution of the load on the system but also the even distribution of the
+ * threads on a system with spare capacity
+ */
+ if ((env->migration_type == migrate_task) &&
+ (sd->nr_balance_failed > sd->cache_nice_tries+2))
+ return 1;
+
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (asym_active_balance(env))
return 1;
+ if (imbalanced_active_balance(env))
+ return 1;
+
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
* It's worth migrating the task if the src_cpu's capacity is reduced
@@ -9522,16 +9528,6 @@ voluntary_active_balance(struct lb_env *env)
return 0;
}
-static int need_active_balance(struct lb_env *env)
-{
- struct sched_domain *sd = env->sd;
-
- if (voluntary_active_balance(env))
- return 1;
-
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
@@ -9623,6 +9619,8 @@ redo:
env.src_rq = busiest;
ld_moved = 0;
+ /* Clear this flag as soon as we find a pullable task */
+ env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -9630,7 +9628,6 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -9756,10 +9753,12 @@ more_balance:
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
+ /* Record that we found at least one task that could run on this_cpu */
+ env.flags &= ~LBF_ALL_PINNED;
+
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
@@ -9781,21 +9780,13 @@ more_balance:
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ }
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
+ if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
}
goto out;
@@ -10700,8 +10691,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /*
+ * Don't need to rebalance while attached to NULL domain or
+ * runqueue CPU is not active
+ */
+ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
@@ -10806,7 +10800,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
@@ -10939,7 +10933,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 68d369cba9e4..1bc2b158fc51 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(HRTICK_DL, false)
SCHED_FEAT(DOUBLE_TICK, false)
/*
@@ -54,7 +55,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_AVG_CPU, false)
SCHED_FEAT(SIS_PROP, true)
/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 305727ea0677..7199e6f23789 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -285,6 +285,7 @@ static void do_idle(void)
}
arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
/*
* In poll mode we reenable interrupts and spin. Also if we
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 08ae45ad9261..b5add64d9698 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -454,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
/*
* For each cpu runqueue, if the task's mm match @mm, ensure that all
- * @mm's membarrier state set bits are also set in in the runqueue's
+ * @mm's membarrier state set bits are also set in the runqueue's
* membarrier state. This ensures that a runqueue scheduling
* between threads which are users of @mm has its membarrier state
* updated.
@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index dbe4629cf7ba..8f720b71d13d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2357,7 +2357,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb09988451a0..10a1522b1e30 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
* scale_load() and scale_load_down(w) to convert between them. The
* following must be true:
*
- * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
*
*/
#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
@@ -1031,6 +1031,7 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -2104,17 +2105,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
*/
static inline int hrtick_enabled(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ if (!sched_feat(HRTICK_DL))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
void hrtick_start(struct rq *rq, u64 delay);
#else
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ return 0;
+}
+
static inline int hrtick_enabled(struct rq *rq)
{
return 0;
@@ -2558,27 +2581,24 @@ static inline unsigned long capacity_orig_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
-#endif
/**
- * enum schedutil_type - CPU utilization type
+ * enum cpu_util_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
* @ENERGY_UTIL: Utilization used during energy calculation
*
* The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
* need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
+ * enum is used within effective_cpu_util() to differentiate the types of
* utilization expected by the callers, and adjust the aggregation accordingly.
*/
-enum schedutil_type {
+enum cpu_util_type {
FREQUENCY_UTIL,
ENERGY_UTIL,
};
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
@@ -2607,14 +2627,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
-#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
-{
- return 0;
-}
-#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5d3675c7a76b..09d35044bd88 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
}
}
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ int distance = node_distance(i, j);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ bitmap_set(distance_map, distance, 1);
}
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!sched_domains_numa_distance) {
+ bitmap_free(distance_map);
+ return;
+ }
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ sched_domains_numa_distance[i] = j;
}
+ bitmap_free(distance_map);
+
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1664,15 +1656,15 @@ void sched_init_numa(void)
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+ * the array will contain less then 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -1680,7 +1672,7 @@ void sched_init_numa(void)
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
+ for (i = 0; i < nr_levels; i++) {
sched_domains_numa_masks[i] =
kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
if (!sched_domains_numa_masks[i])
@@ -1688,12 +1680,17 @@ void sched_init_numa(void)
for (j = 0; j < nr_node_ids; j++) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1705,7 +1702,7 @@ void sched_init_numa(void)
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1728,7 +1725,7 @@ void sched_init_numa(void)
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1740,8 +1737,8 @@ void sched_init_numa(void)
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 952dc1c90229..1d60fc2c9987 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1164,7 +1164,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
* Make sure that any changes to mode from another thread have
* been seen after SYSCALL_WORK_SECCOMP was seen.
*/
- rmb();
+ smp_rmb();
if (!sd) {
populate_seccomp_data(&sd_local);
@@ -1284,6 +1284,8 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
BUG();
+
+ return -1;
}
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 5ad8566534e7..f2a1b898da29 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
return true;
/* Only allow kernel generated signals to this kthread */
- if (unlikely((t->flags & PF_KTHREAD) &&
+ if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
(handler == SIG_KTHREAD_KERNEL) && !force))
return true;
@@ -288,7 +288,8 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
- if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ if (unlikely(fatal_signal_pending(task) ||
+ (task->flags & (PF_EXITING | PF_IO_WORKER))))
return false;
if (mask & JOBCTL_STOP_SIGMASK)
@@ -833,6 +834,9 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
if (!valid_signal(sig))
return -EINVAL;
+ /* PF_IO_WORKER threads don't take any signals */
+ if (t->flags & PF_IO_WORKER)
+ return -ESRCH;
if (!si_fromuser(info))
return 0;
@@ -1096,7 +1100,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
/*
* Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
+ if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out_set;
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 1b6070bf97bb..aeb0adfa0606 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,6 +14,7 @@
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
@@ -449,6 +450,9 @@ void flush_smp_call_function_from_idle(void)
local_irq_save(flags);
flush_smp_call_function_queue(true);
+ if (local_softirq_pending())
+ do_softirq();
+
local_irq_restore(flags);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9d71046ea247..9908ec4a9bfe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,8 @@
#include <linux/tick.h>
#include <linux/irq.h>
+#include <asm/softirq_stack.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 84565c2a41b8..2c5950b0b90e 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -12,6 +12,8 @@
extern struct static_call_site __start_static_call_sites[],
__stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+ __stop_static_call_tramp_key[];
static bool static_call_initialized;
@@ -33,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)
return (void *)((long)site->addr + (long)&site->addr);
}
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
static inline struct static_call_key *static_call_key(const struct static_call_site *site)
{
- return (struct static_call_key *)
- (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
}
/* These assume the key is word-aligned. */
static inline bool static_call_is_init(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
}
static inline bool static_call_is_tail(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
}
static inline void static_call_set_init(struct static_call_site *site)
{
- site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
(long)&site->key;
}
@@ -144,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -163,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -170,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
- WARN_ONCE(1, "can't patch static call site at %pS",
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
site_addr);
continue;
}
arch_static_call_transform(site_addr, NULL, func,
- static_call_is_tail(site));
+ static_call_is_tail(site));
}
}
@@ -323,10 +331,60 @@ static int __static_call_mod_text_reserved(void *start, void *end)
return ret;
}
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+ struct static_call_tramp_key *start = __start_static_call_tramp_key;
+ struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+ struct static_call_tramp_key *tramp_key;
+
+ for (tramp_key = start; tramp_key != stop; tramp_key++) {
+ unsigned long tramp;
+
+ tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+ if (tramp == addr)
+ return (long)tramp_key->key + (long)&tramp_key->key;
+ }
+
+ return 0;
+}
+
static int static_call_add_module(struct module *mod)
{
- return __static_call_init(mod, mod->static_call_sites,
- mod->static_call_sites + mod->num_static_call_sites);
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = start + mod->num_static_call_sites;
+ struct static_call_site *site;
+
+ for (site = start; site != stop; site++) {
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
+ unsigned long key;
+
+ /*
+ * Is the key is exported, 'addr' points to the key, which
+ * means modules are allowed to call static_call_update() on
+ * it.
+ *
+ * Otherwise, the key isn't exported, and 'addr' points to the
+ * trampoline so we need to lookup the key.
+ *
+ * We go through this dance to prevent crazy modules from
+ * abusing sensitive static calls.
+ */
+ if (!kernel_text_address(addr))
+ continue;
+
+ key = tramp_key_lookup(addr);
+ if (!key) {
+ pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+ static_call_addr(site));
+ return -EINVAL;
+ }
+
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
+ }
+
+ return __static_call_init(mod, start, stop);
}
static void static_call_del_module(struct module *mod)
@@ -438,6 +496,11 @@ int __init static_call_init(void)
}
early_initcall(static_call_init);
+long __static_call_return0(void)
+{
+ return 0;
+}
+
#ifdef CONFIG_STATIC_CALL_SELFTEST
static int func_a(int x)
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..2e2e3f378d97 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -24,7 +24,6 @@
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
-#include <linux/dcookies.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
@@ -1243,7 +1242,7 @@ static int override_release(char __user *release, size_t len)
break;
rest++;
}
- v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+ v = LINUX_VERSION_PATCHLEVEL + 60;
copy = clamp_t(size_t, len, 1, sizeof(buf));
copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
ret = copy_to_user(release, buf, copy + 1);
@@ -1848,7 +1847,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
- err = inode_permission(inode, MAY_EXEC);
+ err = file_permission(exe.file, MAY_EXEC);
if (err)
goto exit;
@@ -2080,7 +2079,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
- unsigned long user_auxv[AT_VECTOR_SIZE];
+ unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd848138..62fbd09b5dc1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2962,7 +2962,7 @@ static struct ctl_table vm_table[] = {
.data = &block_dump,
.maxlen = sizeof(block_dump),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -2970,7 +2970,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
@@ -2980,7 +2980,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_legacy_va_layout,
.maxlen = sizeof(sysctl_legacy_va_layout),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#endif
@@ -2990,7 +2990,7 @@ static struct ctl_table vm_table[] = {
.data = &node_reclaim_mode,
.maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f4ace1bf8382..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -527,8 +527,11 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
/**
* alarm_handle_timer - Callback for posix timers
* @alarm: alarm that fired
+ * @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
+ *
+ * Return: whether the timer is to be restarted
*/
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
@@ -715,8 +718,11 @@ static int alarm_timer_create(struct k_itimer *new_timer)
/**
* alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
* @alarm: ptr to alarm that fired
+ * @now: time at the timer expiration
*
* Wakes up the task that set the alarmtimer
+ *
+ * Return: ALARMTIMER_NORESTART
*/
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
@@ -733,6 +739,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
* alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
* @alarm: ptr to alarmtimer
* @absexp: absolute expiration time
+ * @type: alarm type (BOOTTIME/REALTIME).
*
* Sets the alarm timer and sleeps until it is fired or interrupted.
*/
@@ -806,7 +813,6 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
* @which_clock: clockid
* @flags: determins abstime or relative
* @tsreq: requested sleep time (abs or rel)
- * @rmtp: remaining sleep time saved
*
* Handles clock_nanosleep calls against _ALARM clockids
*/
@@ -848,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
+ set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
- * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
- * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next
+ * but does not set cpu_base::*expires_next, that is done by
+ * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
+ * cpu_base::*expires_next right away, reprogramming logic would no longer
+ * work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
+static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ ktime_t expires_next, soft = KTIME_MAX;
+
+ /*
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
+ */
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ /*
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
+ */
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
+ }
+
+ return expires_next;
+}
+
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- /*
- * Find the current next expiration time.
- */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
- /*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
- */
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
- }
+ expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -1644,8 +1662,8 @@ retry:
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
+ /* Reevaluate the clock bases for the [soft] next expiry */
+ expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
@@ -1939,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
}
restart = &current->restart_block;
- restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 6ca625f5e554..12eab0d2ae28 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -465,9 +465,3 @@ struct time_namespace init_time_ns = {
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
-
-static int __init time_ns_init(void)
-{
- return 0;
-}
-subsys_initcall(time_ns_init);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
+ set_restart_fn(restart_block, posix_cpu_nsleep_restart);
}
return error;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8dbc008f8942..f475f1a027c8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1237,6 +1237,20 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
+bool timer_curr_running(struct timer_list *timer)
+{
+ int i;
+
+ for (i = 0; i < NR_BASES; i++) {
+ struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
+
+ if (base->running_timer == timer)
+ return true;
+ }
+
+ return false;
+}
+
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
diff --git a/kernel/torture.c b/kernel/torture.c
index 8562ac18d2eb..01e336f1e5b2 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444);
static bool ftrace_dump_at_shutdown;
module_param(ftrace_dump_at_shutdown, bool, 0444);
+static int verbose_sleep_frequency;
+module_param(verbose_sleep_frequency, int, 0444);
+
+static int verbose_sleep_duration = 1;
+module_param(verbose_sleep_duration, int, 0444);
+
static char *torture_type;
static int verbose;
@@ -58,6 +64,95 @@ static int verbose;
static int fullstop = FULLSTOP_RMMOD;
static DEFINE_MUTEX(fullstop_mutex);
+static atomic_t verbose_sleep_counter;
+
+/*
+ * Sleep if needed from VERBOSE_TOROUT*().
+ */
+void verbose_torout_sleep(void)
+{
+ if (verbose_sleep_frequency > 0 &&
+ verbose_sleep_duration > 0 &&
+ !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency))
+ schedule_timeout_uninterruptible(verbose_sleep_duration);
+}
+EXPORT_SYMBOL_GPL(verbose_torout_sleep);
+
+/*
+ * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit
+ * nanosecond random fuzz. This function and its friends desynchronize
+ * testing from the timer wheel.
+ */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+ ktime_t hto = baset_ns;
+
+ if (trsp)
+ hto += (torture_random(trsp) >> 3) % fuzzt_ns;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
+
+/*
+ * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit
+ * nanosecond (not microsecond!) random fuzz.
+ */
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_us * NSEC_PER_USEC;
+
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * microsecond (not millisecond!) random fuzz.
+ */
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_ms * NSEC_PER_MSEC;
+ u32 fuzzt_ns;
+
+ if ((u32)~0U / NSEC_PER_USEC < fuzzt_us)
+ fuzzt_ns = (u32)~0U;
+ else
+ fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
+
+/*
+ * Schedule a high-resolution-timer sleep in jiffies, with an
+ * implied one-jiffy random fuzz. This is intended to replace calls to
+ * schedule_timeout_interruptible() and friends.
+ */
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = jiffies_to_nsecs(baset_j);
+
+ return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * millisecond (not second!) random fuzz.
+ */
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp)
+{
+ ktime_t baset_ns = baset_s * NSEC_PER_SEC;
+ u32 fuzzt_ns;
+
+ if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms)
+ fuzzt_ns = (u32)~0U;
+ else
+ fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
+
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -80,6 +175,19 @@ static unsigned long sum_online;
static int min_online = -1;
static int max_online;
+static int torture_online_cpus = NR_CPUS;
+
+/*
+ * Some torture testing leverages confusion as to the number of online
+ * CPUs. This function returns the torture-testing view of this number,
+ * which allows torture tests to load-balance appropriately.
+ */
+int torture_num_online_cpus(void)
+{
+ return READ_ONCE(torture_online_cpus);
+}
+EXPORT_SYMBOL_GPL(torture_num_online_cpus);
+
/*
* Attempt to take a CPU offline. Return false if the CPU is already
* offline or if it is not subject to CPU-hotplug operations. The
@@ -134,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
*min_offl = delta;
if (*max_offl < delta)
*max_offl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1);
+ WARN_ON_ONCE(torture_online_cpus <= 0);
}
return true;
@@ -190,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
*min_onl = delta;
if (*max_onl < delta)
*max_onl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1);
}
return true;
@@ -197,6 +308,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
EXPORT_SYMBOL_GPL(torture_online);
/*
+ * Get everything online at the beginning and ends of tests.
+ */
+static void torture_online_all(char *phase)
+{
+ int cpu;
+ int ret;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_online(cpu))
+ continue;
+ ret = add_cpu(cpu);
+ if (ret && verbose) {
+ pr_alert("%s" TORTURE_FLAG
+ "%s: %s online %d: errno %d\n",
+ __func__, phase, torture_type, cpu, ret);
+ }
+ }
+}
+
+/*
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
@@ -206,25 +337,12 @@ torture_onoff(void *arg)
int cpu;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
- int ret;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
- if (!IS_MODULE(CONFIG_TORTURE_TEST)) {
- for_each_possible_cpu(cpu) {
- if (cpu_online(cpu))
- continue;
- ret = add_cpu(cpu);
- if (ret && verbose) {
- pr_alert("%s" TORTURE_FLAG
- "%s: Initial online %d: errno %d\n",
- __func__, torture_type, cpu, ret);
- }
- }
- }
-
+ torture_online_all("Initial");
if (maxcpu == 0) {
VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
goto stop;
@@ -252,6 +370,7 @@ torture_onoff(void *arg)
stop:
torture_kthread_stopping("torture_onoff");
+ torture_online_all("Final");
return 0;
}
@@ -602,7 +721,6 @@ static int stutter_gap;
*/
bool stutter_wait(const char *title)
{
- ktime_t delay;
unsigned int i = 0;
bool ret = false;
int spt;
@@ -618,11 +736,8 @@ bool stutter_wait(const char *title)
schedule_timeout_interruptible(1);
} else if (spt == 2) {
while (READ_ONCE(stutter_pause_test)) {
- if (!(i++ & 0xffff)) {
- set_current_state(TASK_INTERRUPTIBLE);
- delay = 10 * NSEC_PER_USEC;
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
- }
+ if (!(i++ & 0xffff))
+ torture_hrtimeout_us(10, 0, NULL);
cond_resched();
}
} else {
@@ -640,7 +755,6 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
- ktime_t delay;
DEFINE_TORTURE_RANDOM(rand);
int wtime;
@@ -651,20 +765,15 @@ static int torture_stutter(void *arg)
if (stutter > 2) {
WRITE_ONCE(stutter_pause_test, 1);
wtime = stutter - 3;
- delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
- delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC;
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ torture_hrtimeout_jiffies(wtime, &rand);
wtime = 2;
}
WRITE_ONCE(stutter_pause_test, 2);
- delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ torture_hrtimeout_jiffies(wtime, NULL);
}
WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
- schedule_timeout_interruptible(stutter_gap);
+ torture_hrtimeout_jiffies(stutter_gap, NULL);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c1a62ae7e812..7fa82778c3e6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -60,6 +60,11 @@ config HAVE_NOP_MCOUNT
help
Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount
+config HAVE_OBJTOOL_MCOUNT
+ bool
+ help
+ Arch supports objtool --mcount
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -545,7 +550,7 @@ config KPROBE_EVENTS_ON_NOTRACE
using kprobe events.
If kprobes can use ftrace instead of breakpoint, ftrace related
- functions are protected from kprobe-events to prevent an infinit
+ functions are protected from kprobe-events to prevent an infinite
recursion or any unexpected execution path which leads to a kernel
crash.
@@ -602,6 +607,30 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
+config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ bool
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_CC
+ def_bool y
+ depends on $(cc-option,-mrecord-mcount)
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_OBJTOOL
+ def_bool y
+ depends on HAVE_OBJTOOL_MCOUNT
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_RECORDMCOUNT
+ def_bool y
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on !FTRACE_MCOUNT_USE_OBJTOOL
+ depends on FTRACE_MCOUNT_RECORD
+
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -665,7 +694,7 @@ config TRACEPOINT_BENCHMARK
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
- goes into an infinite loop (calling cond_sched() to let other tasks
+ goes into an infinite loop (calling cond_resched() to let other tasks
run), and calls the tracepoint. Each iteration will record the time
it took to write to the tracepoint and the next iteration that
data will be passed to the tracepoint itself. That is, the tracepoint
@@ -886,6 +915,10 @@ config PREEMPTIRQ_DELAY_TEST
irq-disabled critical sections for 500us:
modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
+ What's more, if you want to attach the test on the cpu which the latency
+ tracer is running on, specify cpu_affinity=cpu_num at the end of the
+ command.
+
If unsure, say N
config SYNTH_EVENT_GEN_TEST
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7e44cea89fdc..b28d3e5013cd 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb0fe4c66b84..c221e4c3f625 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- int pc = 0;
+ unsigned int trace_ctx = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -107,7 +107,7 @@ record_it:
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
+ unsigned int trace_ctx = 0;
pid_t pid;
- int cpu, pc = 0;
+ int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -301,7 +302,7 @@ record_it:
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
return;
}
}
@@ -311,8 +312,6 @@ record_it:
static void blk_trace_free(struct blk_trace *bt)
{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
debugfs_remove(bt->dir);
free_percpu(bt->sequence);
@@ -544,10 +543,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+ debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -903,7 +900,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
@@ -915,22 +912,24 @@ static void blk_add_trace_bio_complete(void *ignore,
static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
+ 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
+ 0);
}
static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -967,7 +966,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
@@ -997,7 +996,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
sector_t from)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
@@ -1865,7 +1864,17 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs: buffer to be filled
+ * @op: REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ * Maps the REQ_OP_XXX to character and fills the buffer provided by the
+ * caller with resulting string.
+ *
+ **/
+void blk_fill_rwbs(char *rwbs, unsigned int op)
{
int i = 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 764400260eb6..b0c45d923f0f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1188,6 +1188,10 @@ BTF_SET_END(btf_allowlist_d_path)
static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+
if (prog->type == BPF_PROG_TYPE_LSM)
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
@@ -1757,6 +1761,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_delete_tracing_proto;
case BPF_FUNC_sock_from_file:
return &bpf_sock_from_file_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c
new file mode 100644
index 000000000000..f89792c25b11
--- /dev/null
+++ b/kernel/trace/error_report-traces.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Error reporting trace points.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/error_report.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end);
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 312d1a0ca3b6..8c4ffd076162 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -21,13 +21,16 @@
static ulong delay = 100;
static char test_mode[12] = "irq";
static uint burst_size = 1;
+static int cpu_affinity = -1;
module_param_named(delay, delay, ulong, 0444);
module_param_string(test_mode, test_mode, 12, 0444);
module_param_named(burst_size, burst_size, uint, 0444);
+module_param_named(cpu_affinity, cpu_affinity, int, 0444);
MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
@@ -36,7 +39,9 @@ static struct completion done;
static void busy_wait(ulong time)
{
u64 start, end;
+
start = trace_clock_local();
+
do {
end = trace_clock_local();
if (kthread_should_stop())
@@ -47,6 +52,7 @@ static void busy_wait(ulong time)
static __always_inline void irqoff_test(void)
{
unsigned long flags;
+
local_irq_save(flags);
busy_wait(delay);
local_irq_restore(flags);
@@ -113,6 +119,14 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
+ struct cpumask cpu_mask;
+
+ if (cpu_affinity > -1) {
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(cpu_affinity, &cpu_mask);
+ if (set_cpus_allowed_ptr(current, &cpu_mask))
+ pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
+ }
for (i = 0; i < s; i++)
(testfuncs[i])(i);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ec08f948dd80..68744c51517e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1112,8 +1112,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* its flags will be non zero.
*/
static inline int
-rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *page, struct list_head *list)
+rb_is_head_page(struct buffer_page *page, struct list_head *list)
{
unsigned long val;
@@ -1142,8 +1141,7 @@ static bool rb_is_reader_page(struct buffer_page *page)
/*
* rb_set_list_to_head - set a list_head to be pointing to head.
*/
-static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
+static void rb_set_list_to_head(struct list_head *list)
{
unsigned long *ptr;
@@ -1166,7 +1164,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Set the previous list pointer to have the HEAD flag.
*/
- rb_set_list_to_head(cpu_buffer, head->list.prev);
+ rb_set_list_to_head(head->list.prev);
}
static void rb_list_head_clear(struct list_head *list)
@@ -1241,8 +1239,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
old_flag, RB_PAGE_NORMAL);
}
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
+static inline void rb_inc_page(struct buffer_page **bpage)
{
struct list_head *p = rb_list_head((*bpage)->list.next);
@@ -1274,11 +1271,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
for (i = 0; i < 3; i++) {
do {
- if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
return page;
}
- rb_inc_page(cpu_buffer, &page);
+ rb_inc_page(&page);
} while (page != head);
}
@@ -1824,7 +1821,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cond_resched();
to_remove_page = tmp_iter_page;
- rb_inc_page(cpu_buffer, &tmp_iter_page);
+ rb_inc_page(&tmp_iter_page);
/* update the counters */
page_entries = rb_page_entries(to_remove_page);
@@ -2062,10 +2059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
put_online_cpus();
} else {
- /* Make sure this CPU has been initialized */
- if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
- goto out;
-
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
@@ -2271,7 +2264,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
if (iter->head_page == cpu_buffer->reader_page)
iter->head_page = rb_set_head_page(cpu_buffer);
else
- rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_inc_page(&iter->head_page);
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
@@ -2374,7 +2367,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* want the outer most commit to reset it.
*/
new_head = next_page;
- rb_inc_page(cpu_buffer, &new_head);
+ rb_inc_page(&new_head);
ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
RB_PAGE_NORMAL);
@@ -2526,7 +2519,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
next_page = tail_page;
- rb_inc_page(cpu_buffer, &next_page);
+ rb_inc_page(&next_page);
/*
* If for some reason, we had an interrupt storm that made
@@ -2552,7 +2545,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* the buffer, unless the commit page is still on the
* reader page.
*/
- if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+ if (rb_is_head_page(next_page, &tail_page->list)) {
/*
* If the commit is not on the reader page, then
@@ -2583,7 +2576,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* have filled up the buffer with events
* from interrupts and such, and wrapped.
*
- * Note, if the tail page is also the on the
+ * Note, if the tail page is also on the
* reader_page, we let it move out.
*/
if (unlikely((cpu_buffer->commit_page !=
@@ -2822,6 +2815,17 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
/*
+ * It's possible that the event time delta is zero
+ * (has the same time stamp as the previous event)
+ * in which case write_stamp and before_stamp could
+ * be the same. In such a case, force before_stamp
+ * to be different than write_stamp. It doesn't
+ * matter what it is, as long as its different.
+ */
+ if (!delta)
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ /*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
@@ -2879,7 +2883,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
return;
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ rb_inc_page(&cpu_buffer->commit_page);
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3314,9 +3318,13 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
goto out;
}
atomic_inc(&cpu_buffer->record_disabled);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta, info->after);
+ /* There's some cases in boot up that this can happen */
+ WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
+ pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "");
dump_buffer_page(bpage, info, tail);
atomic_dec(&ts_dump);
/* Do not re-enable checking */
@@ -3638,14 +3646,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* Because the commit page may be on the reader page we
* start with the next page and check the end loop there.
*/
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
start = bpage;
do {
if (bpage->page == (void *)addr) {
local_dec(&bpage->entries);
return;
}
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
} while (bpage != start);
/* commit not part of this buffer?? */
@@ -4367,7 +4375,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->pages = reader->list.prev;
/* The reader page will be pointing to the new head */
- rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+ rb_set_list_to_head(&cpu_buffer->reader_page->list);
/*
* We want to make sure we read the overruns after we set up our
@@ -4406,7 +4414,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_inc_page(&cpu_buffer->head_page);
local_inc(&cpu_buffer->pages_read);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5815a022ecc..eccb4e1187cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps;
int tracing_set_tracer(struct trace_array *tr, const char *buf);
static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -408,7 +408,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
+ TRACE_ITER_HASH_PTR)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -454,6 +455,7 @@ static void __trace_array_put(struct trace_array *this_tr)
/**
* trace_array_put - Decrement the reference counter for this trace array.
+ * @this_tr : pointer to the trace array
*
* NOTE: Use this when we no longer need the trace array returned by
* trace_array_get_by_name(). This ensures the trace array can be later
@@ -530,6 +532,7 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
/**
* trace_ignore_this_task - should a task be ignored for tracing
* @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
* @task: The task that should be ignored if not filtered
*
* Checks if @task should be traced or not from @filtered_pids.
@@ -780,7 +783,7 @@ u64 ftrace_now(int cpu)
}
/**
- * tracing_is_enabled - Show if global_trace has been disabled
+ * tracing_is_enabled - Show if global_trace has been enabled
*
* Shows if the global trace has been enabled or not. It uses the
* mirror flag "buffer_disabled" to be used in fast paths such as for
@@ -905,23 +908,23 @@ static inline void trace_access_lock_init(void)
#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
#else
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned long trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
@@ -929,24 +932,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
+ int type, unsigned int trace_ctx)
{
struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, type, flags, pc);
+ tracing_generic_entry_update(ent, type, trace_ctx);
}
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len);
if (event != NULL)
- trace_event_setup(event, type, flags, pc);
+ trace_event_setup(event, type, trace_ctx);
return event;
}
@@ -1007,25 +1010,22 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int alloc;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
if (!event) {
size = 0;
goto out;
@@ -1044,7 +1044,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
@@ -1061,25 +1061,22 @@ int __trace_bputs(unsigned long ip, const char *str)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int size = sizeof(struct bputs_entry);
int ret = 0;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ trace_ctx);
if (!event)
goto out;
@@ -1088,7 +1085,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -1932,6 +1929,12 @@ static int run_tracer_selftest(struct tracer *type)
if (!selftests_can_run)
return save_selftest(type);
+ if (!tracing_is_on()) {
+ pr_warn("Selftest for tracer %s skipped due to tracing disabled\n",
+ type->name);
+ return 0;
+ }
+
/*
* Run a selftest on this tracer.
* Here we reset the trace buffer, and set the current
@@ -2584,36 +2587,34 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
- unsigned long flags, int pc)
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
- struct task_struct *tsk = current;
+ unsigned int trace_flags = irqs_status;
+ unsigned int pc;
- entry->preempt_count = pc & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
- entry->type = type;
- entry->flags =
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
-#else
- TRACE_FLAG_IRQS_NOSUPPORT |
-#endif
- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+ pc = preempt_count();
+
+ if (pc & NMI_MASK)
+ trace_flags |= TRACE_FLAG_NMI;
+ if (pc & HARDIRQ_MASK)
+ trace_flags |= TRACE_FLAG_HARDIRQ;
+ if (in_serving_softirq())
+ trace_flags |= TRACE_FLAG_SOFTIRQ;
+
+ if (tif_need_resched())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED;
+ if (test_preempt_need_resched())
+ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ return (trace_flags << 16) | (pc & 0xff);
}
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2733,7 +2734,7 @@ struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *entry;
int val;
@@ -2746,15 +2747,15 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
/* Try to use the per cpu buffer first */
val = this_cpu_inc_return(trace_buffered_event_cnt);
if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) {
- trace_event_setup(entry, type, flags, pc);
+ trace_event_setup(entry, type, trace_ctx);
entry->array[0] = len;
return entry;
}
this_cpu_dec(trace_buffered_event_cnt);
}
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
@@ -2763,8 +2764,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
}
return entry;
}
@@ -2850,7 +2851,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc, fbuffer->regs);
+ fbuffer->trace_ctx, fbuffer->regs);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2866,7 +2867,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trace_ctx,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
@@ -2877,8 +2878,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* and mmiotrace, but that's ok if they lose a function or
* two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
- ftrace_trace_userstack(tr, buffer, flags, pc);
+ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+ ftrace_trace_userstack(tr, buffer, trace_ctx);
}
/*
@@ -2892,9 +2893,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
}
void
-trace_function(struct trace_array *tr,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
+trace_function(struct trace_array *tr, unsigned long ip, unsigned long
+ parent_ip, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2902,7 +2902,7 @@ trace_function(struct trace_array *tr,
struct ftrace_entry *entry;
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -2936,8 +2936,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
@@ -2984,7 +2984,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ sizeof(*entry) + size, trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3005,22 +3005,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
}
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
return;
}
@@ -3034,7 +3034,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
return;
rcu_irq_enter_irqson();
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
rcu_irq_exit_irqson();
}
@@ -3044,19 +3044,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
*/
void trace_dump_stack(int skip)
{
- unsigned long flags;
-
if (tracing_disabled || tracing_selftest_running)
return;
- local_save_flags(flags);
-
#ifndef CONFIG_UNWINDER_ORC
/* Skip 1 to skip this function. */
skip++;
#endif
__ftrace_trace_stack(global_trace.array_buffer.buffer,
- flags, skip, preempt_count(), NULL);
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3065,7 +3061,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
static void
ftrace_trace_userstack(struct trace_array *tr,
- struct trace_buffer *buffer, unsigned long flags, int pc)
+ struct trace_buffer *buffer, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
@@ -3092,7 +3088,7 @@ ftrace_trace_userstack(struct trace_array *tr,
__this_cpu_inc(user_stack_count);
event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -3112,7 +3108,7 @@ ftrace_trace_userstack(struct trace_array *tr,
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
@@ -3242,9 +3238,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
- int len = 0, size, pc;
+ int len = 0, size;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3252,7 +3248,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
tbuffer = get_trace_buf();
@@ -3266,12 +3262,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
goto out_put;
- local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3281,7 +3276,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3304,9 +3299,9 @@ __trace_array_vprintk(struct trace_buffer *buffer,
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
- int len = 0, size, pc;
+ int len = 0, size;
struct print_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
if (tracing_disabled || tracing_selftest_running)
@@ -3315,7 +3310,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
@@ -3327,11 +3322,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- local_save_flags(flags);
size = sizeof(*entry) + len + 1;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3340,7 +3334,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3543,6 +3537,65 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
return next;
}
+#define STATIC_FMT_BUF_SIZE 128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+static char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+ char *tmp;
+
+ if (iter->fmt == static_fmt_buf)
+ return NULL;
+
+ tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+ GFP_KERNEL);
+ if (tmp) {
+ iter->fmt_size += STATIC_FMT_BUF_SIZE;
+ iter->fmt = tmp;
+ }
+
+ return tmp;
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+ const char *p, *new_fmt;
+ char *q;
+
+ if (WARN_ON_ONCE(!fmt))
+ return fmt;
+
+ if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ return fmt;
+
+ p = fmt;
+ new_fmt = q = iter->fmt;
+ while (*p) {
+ if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+ if (!trace_iter_expand_format(iter))
+ return fmt;
+
+ q += iter->fmt - new_fmt;
+ new_fmt = iter->fmt;
+ }
+
+ *q++ = *p++;
+
+ /* Replace %p with %px */
+ if (p[-1] == '%') {
+ if (p[0] == '%') {
+ *q++ = *p++;
+ } else if (p[0] == 'p' && !isalnum(p[1])) {
+ *q++ = *p++;
+ *q++ = 'x';
+ }
+ }
+ }
+ *q = '\0';
+
+ return new_fmt;
+}
+
#define STATIC_TEMP_BUF_SIZE 128
static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
@@ -4336,6 +4389,16 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->temp_size = 128;
/*
+ * trace_event_printf() may need to modify given format
+ * string to replace %p with %px so that it shows real address
+ * instead of hash value. However, that is only for the event
+ * tracing, other tracer may not need. Defer the allocation
+ * until it is needed.
+ */
+ iter->fmt = NULL;
+ iter->fmt_size = 0;
+
+ /*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
*/
@@ -4486,6 +4549,7 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
+ kfree(iter->fmt);
kfree(iter->temp);
kfree(iter->trace);
kfree(iter->buffer_iter);
@@ -6653,7 +6717,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6673,7 +6736,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
/* If less than "<faulted>", then make sure we can still add that */
@@ -6682,7 +6744,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -6734,7 +6796,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct raw_data_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6756,14 +6817,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (!event)
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -9348,9 +9408,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* Simulate the iterator */
trace_init_global_iter(&iter);
- /* Can not use kmalloc for iter.temp */
+ /* Can not use kmalloc for iter.temp and iter.fmt */
iter.temp = static_temp_buf;
iter.temp_size = STATIC_TEMP_BUF_SIZE;
+ iter.fmt = static_fmt_buf;
+ iter.fmt_size = STATIC_FMT_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -9429,30 +9491,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
EXPORT_SYMBOL_GPL(ftrace_dump);
-int trace_run_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
#define WRITE_BUFSIZE 4096
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
+ int (*createfn)(const char *))
{
char *kbuf, *buf, *tmp;
int ret = 0;
@@ -9500,7 +9543,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (tmp)
*tmp = '\0';
- ret = trace_run_command(buf, createfn);
+ ret = createfn(buf);
if (ret)
goto out;
buf += size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e448d2da0b99..a6446c03cfbc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCHED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
- TRACE_FLAG_NMI = 0x40,
-};
-
#define TRACE_BUF_SIZE 1024
struct trace_array;
@@ -589,8 +570,7 @@ struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags,
- int pc);
+ unsigned int trace_ctx);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -601,6 +581,8 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+
int trace_empty(struct trace_iterator *iter);
void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -615,15 +597,14 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -687,11 +668,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { }
#endif
#ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
#else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
- int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
}
#endif /* CONFIG_STACKTRACE */
@@ -831,10 +811,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1194,6 +1174,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
+ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1297,15 +1278,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trcace_ctx,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1366,8 +1347,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1377,12 +1357,12 @@ static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc)
+ void *entry, unsigned int trace_ctx)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
if (tt)
event_triggers_post_call(file, tt);
@@ -1394,8 +1374,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1408,14 +1387,14 @@ static inline void
event_trigger_unlock_commit_regs(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc,
+ void *entry, unsigned int trace_ctx,
struct pt_regs *regs)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- irq_flags, pc, regs);
+ trace_ctx, regs);
if (tt)
event_triggers_post_call(file, tt);
@@ -1830,10 +1809,9 @@ extern int tracing_set_cpumask(struct trace_array *tr,
#define MAX_EVENT_NAME_LEN 64
-extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
extern ssize_t trace_parse_run_command(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
+ int (*createfn)(const char *));
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
const char *p;
if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
if (atomic_read(&data->disabled))
goto out;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(flags);
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4f967d5cd917..dc971a68dda4 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -31,23 +31,31 @@ int dyn_event_register(struct dyn_event_operations *ops)
return 0;
}
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type)
{
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
- int ret = -ENOENT;
+ int argc, ret = -ENOENT;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':')
- return -EINVAL;
+ if (argv[0][1] != ':') {
+ ret = -EINVAL;
+ goto out;
+ }
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event)
- return -EINVAL;
+ if (!event) {
+ ret = -EINVAL;
+ goto out;
+ }
event++;
}
- argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -63,7 +71,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
if (type && type != pos->ops)
continue;
if (!pos->ops->match(system, event,
- argc, (const char **)argv, pos))
+ argc - 1, (const char **)argv + 1, pos))
continue;
ret = pos->ops->free(pos);
@@ -71,21 +79,22 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
break;
}
mutex_unlock(&event_mutex);
-
+out:
+ argv_free(argv);
return ret;
}
-static int create_dyn_event(int argc, char **argv)
+static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
int ret = -ENODEV;
- if (argv[0][0] == '-' || argv[0][0] == '!')
- return dyn_event_release(argc, argv, NULL);
+ if (raw_command[0] == '-' || raw_command[0] == '!')
+ return dyn_event_release(raw_command, NULL);
mutex_lock(&dyn_event_ops_mutex);
list_for_each_entry(ops, &dyn_event_ops_list, list) {
- ret = ops->create(argc, (const char **)argv);
+ ret = ops->create(raw_command);
if (!ret || ret != -ECANCELED)
break;
}
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6f72dcb7269..7754936b57ee 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -39,7 +39,7 @@ struct dyn_event;
*/
struct dyn_event_operations {
struct list_head list;
- int (*create)(int argc, const char *argv[]);
+ int (*create)(const char *raw_command);
int (*show)(struct seq_file *m, struct dyn_event *ev);
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
@@ -97,7 +97,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a71181655958..288ad2c274fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
void perf_trace_buf_update(void *record, u16 type)
{
struct trace_entry *entry = record;
- int pc = preempt_count();
- unsigned long flags;
- local_save_flags(flags);
- tracing_generic_entry_update(entry, type, flags, pc);
+ tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d387b774ceeb..a3563afd412d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
trace_event_ignore_this_pid(trace_file))
return NULL;
- local_save_flags(fbuffer->flags);
- fbuffer->pc = preempt_count();
/*
* If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- fbuffer->pc--;
+ fbuffer->trace_ctx = tracing_gen_ctx_dec();
fbuffer->trace_file = trace_file;
fbuffer->event =
trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->trace_ctx);
if (!fbuffer->event)
return NULL;
@@ -2101,16 +2098,20 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = tracefs_create_file("filter", 0644, dir->entry, dir,
- &ftrace_subsystem_filter_fops);
- if (!entry) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
+ /* the ftrace system is special, do not create enable or filter files */
+ if (strcmp(name, "ftrace") != 0) {
- trace_create_file("enable", 0644, dir->entry, dir,
- &ftrace_system_enable_fops);
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
+ }
+
+ trace_create_file("enable", 0644, dir->entry, dir,
+ &ftrace_system_enable_fops);
+ }
list_add(&dir->list, &tr->systems);
@@ -3679,12 +3680,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct trace_buffer *buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3692,11 +3692,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
if (disabled != 1)
goto out;
- local_save_flags(flags);
-
event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3704,7 +3702,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->parent_ip = parent_ip;
event_trigger_unlock_commit(&event_trace_file, buffer, event,
- entry, flags, pc);
+ entry, trace_ctx);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 22bcf7c51d1e..c188045c5f97 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
{
struct ftrace_event_field *field;
- unsigned long irq_flags;
void *entry = NULL;
int entry_size;
u64 val = 0;
@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
if (!entry)
return -ENOMEM;
- local_save_flags(irq_flags);
- tracing_generic_entry_update(entry, call->event.type, irq_flags,
- preempt_count());
+ tracing_generic_entry_update(entry, call->event.type,
+ tracing_gen_ctx());
while ((len = parse_field(str, call, &field, &val)) > 0) {
if (is_function_field(field))
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 5a8bc0b421f1..8d71e6c83f10 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -23,13 +23,14 @@
#undef ERRORS
#define ERRORS \
C(BAD_NAME, "Illegal name"), \
- C(CMD_INCOMPLETE, "Incomplete command"), \
+ C(INVALID_CMD, "Command must be of the form: <name> field[;field] ..."),\
+ C(INVALID_DYN_CMD, "Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\
C(EVENT_EXISTS, "Event already exists"), \
C(TOO_MANY_FIELDS, "Too many fields"), \
C(INCOMPLETE_TYPE, "Incomplete type"), \
C(INVALID_TYPE, "Invalid type"), \
- C(INVALID_FIELD, "Invalid field"), \
- C(CMD_TOO_LONG, "Command too long"),
+ C(INVALID_FIELD, "Invalid field"), \
+ C(INVALID_ARRAY_SPEC, "Invalid array specification"),
#undef C
#define C(a, b) SYNTH_ERR_##a
@@ -48,7 +49,7 @@ static int errpos(const char *str)
return err_pos(last_cmd, str);
}
-static void last_cmd_set(char *str)
+static void last_cmd_set(const char *str)
{
if (!str)
return;
@@ -62,7 +63,7 @@ static void synth_err(u8 err_type, u8 err_pos)
err_type, err_pos);
}
-static int create_synth_event(int argc, const char **argv);
+static int create_synth_event(const char *raw_command);
static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
@@ -579,18 +580,32 @@ static void free_synth_field(struct synth_field *field)
kfree(field);
}
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
+static int check_field_version(const char *prefix, const char *field_type,
+ const char *field_name)
+{
+ /*
+ * For backward compatibility, the old synthetic event command
+ * format did not require semicolons, and in order to not
+ * break user space, that old format must still work. If a new
+ * feature is added, then the format that uses the new feature
+ * will be required to have semicolons, as nothing that uses
+ * the old format would be using the new, yet to be created,
+ * feature. When a new feature is added, this will detect it,
+ * and return a number greater than 1, and require the format
+ * to use semicolons.
+ */
+ return 1;
+}
+
+static struct synth_field *parse_synth_field(int argc, char **argv,
+ int *consumed, int *field_version)
{
- struct synth_field *field;
const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
+ struct synth_field *field;
int len, ret = -ENOMEM;
struct seq_buf s;
ssize_t size;
- if (field_type[0] == ';')
- field_type++;
-
if (!strcmp(field_type, "unsigned")) {
if (argc < 3) {
synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type));
@@ -599,12 +614,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
prefix = "unsigned ";
field_type = argv[1];
field_name = argv[2];
- *consumed = 3;
+ *consumed += 3;
} else {
field_name = argv[1];
- *consumed = 2;
+ *consumed += 2;
}
+ if (!field_name) {
+ synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
+ return ERR_PTR(-EINVAL);
+ }
+
+ *field_version = check_field_version(prefix, field_type, field_name);
+
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
return ERR_PTR(-ENOMEM);
@@ -613,8 +635,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
array = strchr(field_name, '[');
if (array)
len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
if (!field->name)
@@ -626,8 +646,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
goto free;
}
- if (field_type[0] == ';')
- field_type++;
len = strlen(field_type) + 1;
if (array)
@@ -644,11 +662,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
if (prefix)
seq_buf_puts(&s, prefix);
seq_buf_puts(&s, field_type);
- if (array) {
+ if (array)
seq_buf_puts(&s, array);
- if (s.buffer[s.len - 1] == ';')
- s.len--;
- }
if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
goto free;
@@ -656,7 +671,10 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
size = synth_field_size(field->type);
if (size < 0) {
- synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
+ if (array)
+ synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name));
+ else
+ synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
ret = -EINVAL;
goto free;
} else if (size == 0) {
@@ -1160,46 +1178,13 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
}
EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-static int save_cmdstr(int argc, const char *name, const char **argv)
-{
- struct seq_buf s;
- char *buf;
- int i;
-
- buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN);
-
- seq_buf_puts(&s, name);
-
- for (i = 0; i < argc; i++) {
- seq_buf_putc(&s, ' ');
- seq_buf_puts(&s, argv[i]);
- }
-
- if (!seq_buf_buffer_left(&s)) {
- synth_err(SYNTH_ERR_CMD_TOO_LONG, 0);
- kfree(buf);
- return -EINVAL;
- }
- buf[s.len] = 0;
- last_cmd_set(buf);
-
- kfree(buf);
- return 0;
-}
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
+static int __create_synth_event(const char *name, const char *raw_fields)
{
+ char **argv, *field_str, *tmp_fields, *saved_fields = NULL;
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ int consumed, cmd_version = 1, n_fields_this_loop;
+ int i, argc, n_fields = 0, ret = 0;
struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
-
- ret = save_cmdstr(argc, name, argv);
- if (ret)
- return ret;
/*
* Argument syntax:
@@ -1208,46 +1193,101 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
* where 'field' = type field_name
*/
- if (name[0] == '\0' || argc < 1) {
- synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+ if (name[0] == '\0') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
return -EINVAL;
}
- mutex_lock(&event_mutex);
-
if (!is_good_name(name)) {
synth_err(SYNTH_ERR_BAD_NAME, errpos(name));
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
+ mutex_lock(&event_mutex);
+
event = find_synth_event(name);
if (event) {
synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name));
ret = -EEXIST;
- goto out;
+ goto err;
}
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
- ret = -EINVAL;
+ tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL);
+ if (!tmp_fields) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ while ((field_str = strsep(&tmp_fields, ";")) != NULL) {
+ argv = argv_split(GFP_KERNEL, field_str, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
goto err;
}
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
+ if (!argc) {
+ argv_free(argv);
+ continue;
+ }
+
+ n_fields_this_loop = 0;
+ consumed = 0;
+ while (argc > consumed) {
+ int field_version;
+
+ field = parse_synth_field(argc - consumed,
+ argv + consumed, &consumed,
+ &field_version);
+ if (IS_ERR(field)) {
+ argv_free(argv);
+ ret = PTR_ERR(field);
+ goto err;
+ }
+
+ /*
+ * Track the highest version of any field we
+ * found in the command.
+ */
+ if (field_version > cmd_version)
+ cmd_version = field_version;
+
+ /*
+ * Now sort out what is and isn't valid for
+ * each supported version.
+ *
+ * If we see more than 1 field per loop, it
+ * means we have multiple fields between
+ * semicolons, and that's something we no
+ * longer support in a version 2 or greater
+ * command.
+ */
+ if (cmd_version > 1 && n_fields_this_loop >= 1) {
+ synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fields[n_fields++] = field;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ n_fields_this_loop++;
+ }
+
+ if (consumed < argc) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
goto err;
}
- fields[n_fields++] = field;
- i += consumed - 1;
+
+ argv_free(argv);
}
- if (i < argc && strcmp(argv[i], ";") != 0) {
- synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i]));
+ if (n_fields == 0) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
ret = -EINVAL;
goto err;
}
@@ -1266,6 +1306,8 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
out:
mutex_unlock(&event_mutex);
+ kfree(saved_fields);
+
return ret;
err:
for (i = 0; i < n_fields; i++)
@@ -1383,19 +1425,79 @@ int synth_event_delete(const char *event_name)
}
EXPORT_SYMBOL_GPL(synth_event_delete);
-static int create_or_delete_synth_event(int argc, char **argv)
+static int check_command(const char *raw_command)
{
- const char *name = argv[0];
- int ret;
+ char **argv = NULL, *cmd, *saved_cmd, *name_and_field;
+ int argc, ret = 0;
+
+ cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ name_and_field = strsep(&cmd, ";");
+ if (!name_and_field) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (name_and_field[0] == '!')
+ goto free;
+
+ argv = argv_split(GFP_KERNEL, name_and_field, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ argv_free(argv);
+
+ if (argc < 3)
+ ret = -EINVAL;
+free:
+ kfree(saved_cmd);
+
+ return ret;
+}
+
+static int create_or_delete_synth_event(const char *raw_command)
+{
+ char *name = NULL, *fields, *p;
+ int ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+
+ ret = check_command(raw_command);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ p = strpbrk(raw_command, " \t");
+ if (!p && raw_command[0] != '!') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
- /* trace_run_command() ensures argc != 0 */
if (name[0] == '!') {
ret = synth_event_delete(name + 1);
- return ret;
+ goto free;
}
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
+ fields = skip_spaces(p);
+
+ ret = __create_synth_event(name, fields);
+free:
+ kfree(name);
+
+ return ret;
}
static int synth_event_run_command(struct dynevent_cmd *cmd)
@@ -1403,7 +1505,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd)
struct synth_event *se;
int ret;
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ ret = create_or_delete_synth_event(cmd->seq.buffer);
if (ret)
return ret;
@@ -1939,10 +2041,27 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state)
}
EXPORT_SYMBOL_GPL(synth_event_trace_end);
-static int create_synth_event(int argc, const char **argv)
+static int create_synth_event(const char *raw_command)
{
- const char *name = argv[0];
- int len;
+ char *fields, *p;
+ const char *name;
+ int len, ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+
+ p = strpbrk(raw_command, " \t");
+ if (!p) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return -EINVAL;
+ }
+
+ fields = skip_spaces(p);
+
+ name = raw_command;
if (name[0] != 's' || name[1] != ':')
return -ECANCELED;
@@ -1951,11 +2070,30 @@ static int create_synth_event(int argc, const char **argv)
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
+ if (len == 0) {
+ synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0);
return -EINVAL;
+ }
name += len;
}
- return __create_synth_event(argc - 1, name, argv + 1);
+
+ len = name - raw_command;
+
+ ret = check_command(raw_command + len);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ ret = __create_synth_event(name, fields);
+
+ kfree(name);
+
+ return ret;
}
static int synth_event_release(struct dyn_event *ev)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c5095dd28e20..f93723ca66bc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -106,8 +106,7 @@ static int function_trace_init(struct trace_array *tr)
ftrace_init_array_ops(tr, func);
- tr->array_buffer.cpu = get_cpu();
- put_cpu();
+ tr->array_buffer.cpu = raw_smp_processor_id();
tracing_start_cmdline_record();
tracing_start_function_trace(tr);
@@ -132,10 +131,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
int bit;
int cpu;
- int pc;
if (unlikely(!tr->function_enabled))
return;
@@ -144,15 +142,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (!atomic_read(&data->disabled)) {
- local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
- }
+ if (!atomic_read(&data->disabled))
+ trace_function(tr, ip, parent_ip, trace_ctx);
+
ftrace_test_recursion_unlock(bit);
preempt_enable_notrace();
}
@@ -184,7 +181,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
if (unlikely(!tr->function_enabled))
return;
@@ -199,9 +196,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- trace_function(tr, ip, parent_ip, flags, pc);
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_function(tr, ip, parent_ip, trace_ctx);
+ __trace_stack(tr, trace_ctx, STACK_SKIP);
}
atomic_dec(&data->disabled);
@@ -404,13 +401,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- local_save_flags(flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
+ __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d874dec87131..0aa6e6faa943 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int ret;
int cpu;
- int pc;
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
return 0;
@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
} else {
ret = 0;
}
@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
static void
__trace_graph_function(struct trace_array *tr,
- unsigned long ip, unsigned long flags, int pc)
+ unsigned long ip, unsigned int trace_ctx)
{
u64 time = trace_clock_local();
struct ftrace_graph_ent ent = {
@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr,
.rettime = time,
};
- __trace_graph_entry(tr, &ent, flags, pc);
- __trace_graph_return(tr, &ret, flags, pc);
+ __trace_graph_entry(tr, &ent, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx);
}
void
trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- __trace_graph_function(tr, ip, flags, pc);
+ __trace_graph_function(tr, ip, trace_ctx);
}
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
ftrace_graph_addr_finish(trace);
@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index c0df9b97f147..34dc1a712dcb 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
- unsigned long flags;
- int pc;
-
- pc = preempt_count();
- local_save_flags(flags);
event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
- flags, pc);
+ tracing_gen_ctx());
if (!event)
return;
entry = ring_buffer_event_data(event);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6756379b661f..590b3d51afae 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
if (!func_prolog_dec(tr, &data, &flags))
return;
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
int ret;
- int pc;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_dec(tr, &data, &flags))
return 0;
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
return ret;
@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
#else
@@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr,
{
u64 T0, T1, delta;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
- local_save_flags(flags);
-
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
if (!report_latency(tr, delta))
goto out;
@@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(tr, delta))
goto out_unlock;
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
/* Skip 5 functions to get to the irq/preempt enable function */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, trace_ctx, 5);
if (data->critical_sequence != max_sequence)
goto out_unlock;
@@ -363,16 +364,15 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
}
static nokprobe_inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- local_save_flags(flags);
-
- __trace_function(tr, ip, parent_ip, flags, pc);
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
per_cpu(tracing_cpu, cpu) = 1;
@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
}
static nokprobe_inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_inc(&data->disabled);
- local_save_flags(flags);
- __trace_function(tr, ip, parent_ip, flags, pc);
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
/* start and stop critical timings used to for stoppage (in idle) */
void start_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
NOKPROBE_SYMBOL(stop_critical_timings);
@@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
*/
void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ stop_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ start_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_off);
@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly =
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ stop_critical_timing(a0, a1);
}
void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ start_critical_timing(a0, a1);
}
static int preemptoff_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 56c7fbff7bd7..6fe770d86dc3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ static int __init set_kprobe_boot_events(char *str)
}
__setup("kprobe_event=", set_kprobe_boot_events);
-static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_create(const char *raw_command);
static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_kprobe_release(struct dyn_event *ev);
static bool trace_kprobe_is_busy(struct dyn_event *ev);
@@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
ret = !!find_module(tk->symbol);
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
*p = ':';
return ret;
@@ -711,7 +711,7 @@ static inline void sanitize_event_name(char *name)
*name = '_';
}
-static int trace_kprobe_create(int argc, const char *argv[])
+static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
@@ -910,20 +910,25 @@ error:
goto out;
}
-static int create_or_delete_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_kprobe_create);
+}
+
+static int create_or_delete_trace_kprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_kprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(argc, (const char **)argv);
+ ret = trace_kprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
static int trace_kprobe_run_command(struct dynevent_cmd *cmd)
{
- return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(cmd->seq.buffer);
}
/**
@@ -1084,7 +1089,7 @@ int kprobe_event_delete(const char *name)
snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name);
- return trace_run_command(buf, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(buf);
}
EXPORT_SYMBOL_GPL(kprobe_event_delete);
@@ -1386,8 +1391,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
+ fbuffer.trace_ctx = tracing_gen_ctx();
fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
@@ -1396,7 +1400,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
call->event.type,
sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
+ fbuffer.trace_ctx);
if (!fbuffer.event)
return;
@@ -1434,8 +1438,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
+ fbuffer.trace_ctx = tracing_gen_ctx();
fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
@@ -1443,7 +1446,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
call->event.type,
sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
+ fbuffer.trace_ctx);
if (!fbuffer.event)
return;
@@ -1888,7 +1891,7 @@ static __init void setup_boot_kprobe_events(void)
if (p)
*p++ = '\0';
- ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe(cmd);
if (ret)
pr_warn("Failed to add event(%d): %s\n", ret, cmd);
@@ -1982,8 +1985,7 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -2004,8 +2006,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -2078,13 +2079,13 @@ static __init int kprobe_trace_self_tests_init(void)
trace_probe_event_call(&tk->tp), file);
}
- ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe2");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 84582bf1ed5f..64e77b513697 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -5,8 +5,6 @@
* Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
*/
-#define DEBUG 1
-
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
@@ -300,10 +298,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -312,7 +311,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -330,10 +329,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -342,7 +342,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 92b1575ae0ca..61255bad7e01 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -312,13 +312,23 @@ int trace_raw_output_prep(struct trace_iterator *iter,
}
EXPORT_SYMBOL(trace_raw_output_prep);
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+ va_end(ap);
+}
+EXPORT_SYMBOL(trace_event_printf);
+
static int trace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
trace_seq_printf(s, "%s: ", name);
- trace_seq_vprintf(s, fmt, ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d2867ccc6aca..ec589a4612df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1134,3 +1134,20 @@ bool trace_probe_match_command_args(struct trace_probe *tp,
}
return true;
}
+
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **))
+{
+ int argc = 0, ret = 0;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, (const char **)argv);
+
+ argv_free(argv);
+
+ return ret;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2f703a20c724..7ce4027089ee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -341,6 +341,7 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
int argc, const char **argv);
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c0181066dbe9..e5778d1d7a5b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -67,7 +67,7 @@ static bool function_enabled;
static int
func_prolog_preempt_disable(struct trace_array *tr,
struct trace_array_cpu **data,
- int *pc)
+ unsigned int *trace_ctx)
{
long disabled;
int cpu;
@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (likely(!wakeup_task))
return 0;
- *pc = preempt_count();
+ *trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc, ret = 0;
+ unsigned int trace_ctx;
+ int ret = 0;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
- local_save_flags(flags);
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- local_save_flags(flags);
- __trace_graph_return(tr, trace, flags, pc);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
local_irq_restore(flags);
atomic_dec(&data->disabled);
@@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -375,7 +372,7 @@ static void
tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct ctx_switch_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
struct task_struct *curr,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
@@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
@@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
tracing_record_cmdline(prev);
@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
if (next != wakeup_task)
return;
- pc = preempt_count();
-
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
@@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
goto out;
local_irq_save(flags);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
arch_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */
@@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* The task we are waiting for is waking up */
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx);
+ tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
@@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
- unsigned long flags;
long disabled;
- int pc;
+ unsigned int trace_ctx;
if (likely(!tracer_enabled))
return;
@@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p)
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- pc = preempt_count();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
+ trace_ctx = tracing_gen_ctx();
+
/* interrupts should be off from try_to_wake_up */
arch_spin_lock(&wakeup_lock);
@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p)
wakeup_task = get_task_struct(p);
- local_save_flags(flags);
-
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
- tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
* is not called by an assembly function (where as schedule is)
* it should be safe to use it here.
*/
- __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
out_locked:
arch_spin_unlock(&wakeup_lock);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d85a2f0f316b..8bfcd3b09422 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
unsigned long args[6];
- int pc;
int syscall_nr;
int size;
@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
- sys_data->enter_event->event.type, size, irq_flags, pc);
+ sys_data->enter_event->event.type, size, trace_ctx);
if (!event)
return;
@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_ctx);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- unsigned long irq_flags;
- int pc;
+ unsigned int trace_ctx;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->exit_event->event.type, sizeof(*entry),
- irq_flags, pc);
+ trace_ctx);
if (!event)
return;
@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->ret = syscall_get_return_value(current, regs);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_ctx);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3cf7128e1ad3..9b50869a5ddb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -34,7 +34,7 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_create(const char *raw_command);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
@@ -530,7 +530,7 @@ end:
* Argument syntax:
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
-static int trace_uprobe_create(int argc, const char **argv)
+static int __trace_uprobe_create(int argc, const char **argv)
{
struct trace_uprobe *tu;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
@@ -716,14 +716,19 @@ fail_address_parse:
return ret;
}
-static int create_or_delete_trace_uprobe(int argc, char **argv)
+int trace_uprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_uprobe_create);
+}
+
+static int create_or_delete_trace_uprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_uprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(argc, (const char **)argv);
+ ret = trace_uprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -961,7 +966,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type, size, 0, 0);
+ call->event.type, size, 0);
if (!event)
return;
@@ -977,7 +982,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0);
}
/* uprobe handler */
@@ -1635,7 +1640,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
}
#endif /* CONFIG_PERF_EVENTS */
-/* Make a trace interface for controling probe points */
+/* Make a trace interface for controlling probe points */
static __init int init_uprobe_trace(void)
{
int ret;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 7261fa0f5e3c..9f478d29b926 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -53,6 +53,12 @@ struct tp_probes {
struct tracepoint_func probes[];
};
+/* Called in removal of a func but failed to allocate a new tp_funcs */
+static void tp_stub_func(void)
+{
+ return;
+}
+
static inline void *allocate_probes(int count)
{
struct tp_probes *p = kmalloc(struct_size(p, probes, count),
@@ -130,8 +136,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
int prio)
{
struct tracepoint_func *old, *new;
- int nr_probes = 0;
- int pos = -1;
+ int iter_probes; /* Iterate over old probe array. */
+ int nr_probes = 0; /* Counter for probes */
+ int pos = -1; /* Insertion position into new array */
if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
@@ -140,13 +147,13 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- /* Insert before probes of lower priority */
- if (pos < 0 && old[nr_probes].prio < prio)
- pos = nr_probes;
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue; /* Skip stub functions. */
+ if (old[iter_probes].func == tp_func->func &&
+ old[iter_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ nr_probes++;
}
}
/* + 2 : one for new probe, one for NULL func */
@@ -154,20 +161,24 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old) {
- if (pos < 0) {
- pos = nr_probes;
- memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- } else {
- /* Copy higher priority probes ahead of the new probe */
- memcpy(new, old, pos * sizeof(struct tracepoint_func));
- /* Copy the rest after it. */
- memcpy(new + pos + 1, old + pos,
- (nr_probes - pos) * sizeof(struct tracepoint_func));
+ nr_probes = 0;
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue;
+ /* Insert before probes of lower priority */
+ if (pos < 0 && old[iter_probes].prio < prio)
+ pos = nr_probes++;
+ new[nr_probes++] = old[iter_probes];
}
- } else
+ if (pos < 0)
+ pos = nr_probes++;
+ /* nr_probes now points to the end of the new array */
+ } else {
pos = 0;
+ nr_probes = 1; /* must point at end of array */
+ }
new[pos] = *tp_func;
- new[nr_probes + 1].func = NULL;
+ new[nr_probes].func = NULL;
*funcs = new;
debug_print_probes(*funcs);
return old;
@@ -188,8 +199,9 @@ static void *func_remove(struct tracepoint_func **funcs,
/* (N -> M), (N > 1, M >= 0) probes */
if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ if ((old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data) ||
+ old[nr_probes].func == tp_stub_func)
nr_del++;
}
}
@@ -208,14 +220,27 @@ static void *func_remove(struct tracepoint_func **funcs,
/* N -> M, (N > 1, M > 0) */
/* + 1 for NULL */
new = allocate_probes(nr_probes - nr_del + 1);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- for (i = 0; old[i].func; i++)
- if (old[i].func != tp_func->func
- || old[i].data != tp_func->data)
- new[j++] = old[i];
- new[nr_probes - nr_del].func = NULL;
- *funcs = new;
+ if (new) {
+ for (i = 0; old[i].func; i++) {
+ if ((old[i].func != tp_func->func ||
+ old[i].data != tp_func->data) &&
+ old[i].func != tp_stub_func)
+ new[j++] = old[i];
+ }
+ new[nr_probes - nr_del].func = NULL;
+ *funcs = new;
+ } else {
+ /*
+ * Failed to allocate, replace the old function
+ * with calls to tp_stub_func.
+ */
+ for (i = 0; old[i].func; i++) {
+ if (old[i].func == tp_func->func &&
+ old[i].data == tp_func->data)
+ WRITE_ONCE(old[i].func, tp_stub_func);
+ }
+ *funcs = old;
+ }
}
debug_print_probes(*funcs);
return old;
@@ -295,10 +320,12 @@ static int tracepoint_remove_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
- if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ if (WARN_ON_ONCE(IS_ERR(old)))
return PTR_ERR(old);
- }
+
+ if (tp_funcs == old)
+ /* Failed allocating new tp_funcs, replaced func with stub */
+ return 0;
if (!tp_funcs) {
/* Removed last function */
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 0ef8f65bd2d7..9c9eb20dd2c5 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -413,7 +413,7 @@ static void put_watch(struct watch *watch)
}
/**
- * init_watch_queue - Initialise a watch
+ * init_watch - Initialise a watch
* @watch: The watch to initialise.
* @wqueue: The queue to assign.
*
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 894bb885b40b..0d150da252e8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2964,8 +2964,8 @@ reflush:
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
- wq->name, flush_cnt);
+ pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
+ wq->name, __func__, flush_cnt);
mutex_unlock(&wq->mutex);
goto reflush;