Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 4
-rw-r--r--  kernel/audit.h | 12
-rw-r--r--  kernel/audit_fsnotify.c | 2
-rw-r--r--  kernel/auditfilter.c | 3
-rw-r--r--  kernel/auditsc.c | 24
-rw-r--r--  kernel/bpf/bpf_inode_storage.c | 2
-rw-r--r--  kernel/bpf/bpf_lsm.c | 3
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 2
-rw-r--r--  kernel/bpf/btf.c | 2
-rw-r--r--  kernel/bpf/core.c | 10
-rw-r--r--  kernel/bpf/disasm.c | 2
-rw-r--r--  kernel/bpf/inode.c | 17
-rw-r--r--  kernel/bpf/preload/bpf_preload_kern.c | 19
-rw-r--r--  kernel/bpf/stackmap.c | 12
-rw-r--r--  kernel/bpf/syscall.c | 5
-rw-r--r--  kernel/bpf/trampoline.c | 248
-rw-r--r--  kernel/bpf/verifier.c | 341
-rw-r--r--  kernel/capability.c | 14
-rw-r--r--  kernel/cfi.c | 329
-rw-r--r--  kernel/cgroup/cgroup.c | 57
-rw-r--r--  kernel/cgroup/cpuset.c | 6
-rw-r--r--  kernel/debug/debug_core.c | 39
-rw-r--r--  kernel/debug/gdbstub.c | 8
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 75
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 598
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 17
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 71
-rw-r--r--  kernel/dma/map_benchmark.c | 12
-rw-r--r--  kernel/dma/mapping.c | 40
-rw-r--r--  kernel/dma/swiotlb.c | 310
-rw-r--r--  kernel/entry/common.c | 4
-rw-r--r--  kernel/events/core.c | 50
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/fork.c | 38
-rw-r--r--  kernel/futex.c | 3
-rw-r--r--  kernel/gcov/clang.c | 88
-rw-r--r--  kernel/groups.c | 7
-rw-r--r--  kernel/irq/chip.c | 8
-rw-r--r--  kernel/irq/dummychip.c | 2
-rw-r--r--  kernel/irq/ipi.c | 2
-rw-r--r--  kernel/irq/irq_sim.c | 35
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 62
-rw-r--r--  kernel/irq/manage.c | 27
-rw-r--r--  kernel/irq/matrix.c | 11
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/irq/msi.c | 2
-rw-r--r--  kernel/irq/proc.c | 4
-rw-r--r--  kernel/irq/resend.c | 2
-rw-r--r--  kernel/irq/spurious.c | 4
-rw-r--r--  kernel/irq/timings.c | 8
-rw-r--r--  kernel/jump_label.c | 8
-rw-r--r--  kernel/kallsyms.c | 63
-rw-r--r--  kernel/kexec_internal.h | 2
-rw-r--r--  kernel/kprobes.c | 44
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/livepatch/core.c | 7
-rw-r--r--  kernel/locking/lockdep.c | 7
-rw-r--r--  kernel/locking/lockdep_internals.h | 8
-rw-r--r--  kernel/locking/mutex.c | 25
-rw-r--r--  kernel/locking/qrwlock.c | 7
-rw-r--r--  kernel/locking/rtmutex.c | 4
-rw-r--r--  kernel/locking/rwsem.c | 2
-rw-r--r--  kernel/locking/semaphore.c | 2
-rw-r--r--  kernel/module.c | 524
-rw-r--r--  kernel/module_signature.c | 2
-rw-r--r--  kernel/module_signing.c | 2
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/energy_model.c | 2
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/printk/internal.h | 7
-rw-r--r--  kernel/printk/printk.c | 478
-rw-r--r--  kernel/printk/printk_safe.c | 30
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/resource.c | 98
-rw-r--r--  kernel/sched/core.c | 127
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 43
-rw-r--r--  kernel/sched/cputime.c | 4
-rw-r--r--  kernel/sched/fair.c | 2
-rw-r--r--  kernel/sched/membarrier.c | 6
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/signal.c | 10
-rw-r--r--  kernel/softirq.c | 357
-rw-r--r--  kernel/static_call.c | 47
-rw-r--r--  kernel/sys.c | 22
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 4
-rw-r--r--  kernel/time/hrtimer.c | 80
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c | 8
-rw-r--r--  kernel/time/posix-timers.c | 4
-rw-r--r--  kernel/time/test_udelay.c | 7
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 20
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-oneshot.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 10
-rw-r--r--  kernel/time/tick-sched.h | 2
-rw-r--r--  kernel/time/time.c | 2
-rw-r--r--  kernel/time/timecounter.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 46
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/vsyscall.c | 2
-rw-r--r--  kernel/trace/Kconfig | 37
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 37
-rw-r--r--  kernel/trace/error_report-traces.c | 11
-rw-r--r--  kernel/trace/ftrace.c | 52
-rw-r--r--  kernel/trace/preemptirq_delay_test.c | 14
-rw-r--r--  kernel/trace/ring_buffer.c | 62
-rw-r--r--  kernel/trace/trace.c | 298
-rw-r--r--  kernel/trace/trace.h | 64
-rw-r--r--  kernel/trace/trace_branch.c | 6
-rw-r--r--  kernel/trace/trace_dynevent.c | 41
-rw-r--r--  kernel/trace/trace_dynevent.h | 4
-rw-r--r--  kernel/trace/trace_event_perf.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 40
-rw-r--r--  kernel/trace/trace_events_inject.c | 6
-rw-r--r--  kernel/trace/trace_events_synth.c | 322
-rw-r--r--  kernel/trace/trace_functions.c | 31
-rw-r--r--  kernel/trace/trace_functions_graph.c | 32
-rw-r--r--  kernel/trace/trace_hwlat.c | 7
-rw-r--r--  kernel/trace/trace_irqsoff.c | 86
-rw-r--r--  kernel/trace/trace_kprobe.c | 47
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 16
-rw-r--r--  kernel/trace/trace_output.c | 12
-rw-r--r--  kernel/trace/trace_probe.c | 17
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 71
-rw-r--r--  kernel/trace/trace_syscalls.c | 20
-rw-r--r--  kernel/trace/trace_uprobe.c | 23
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user_namespace.c | 65
-rw-r--r--  kernel/usermode_driver.c | 21
-rw-r--r--  kernel/watch_queue.c | 2
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 25
142 files changed, 4054 insertions, 2348 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index aa7368c7eabf..e8a6715f38dc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,6 +41,9 @@ KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
+# Don't instrument error handlers
+CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
+
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -51,7 +54,7 @@ obj-y += livepatch/
obj-y += dma/
obj-y += entry/
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -111,6 +114,7 @@ obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/audit.c b/kernel/audit.c
index 551a394bc8f4..121d37e700a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2132,7 +2132,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid(current, &sid);
+ security_task_getsecid_subj(current, &sid);
if (!sid)
return 0;
@@ -2353,7 +2353,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
+ security_task_getsecid_subj(current, &audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index 3b9c0945225a..1522e100fd17 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -292,8 +292,8 @@ extern void audit_filter_inodes(struct task_struct *tsk,
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, t, s) 0
-#define audit_put_watch(w) {}
-#define audit_get_watch(w) {}
+#define audit_put_watch(w) do { } while (0)
+#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
@@ -302,8 +302,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
-#define audit_remove_mark(m)
-#define audit_remove_mark_rule(k)
+#define audit_remove_mark(m) do { } while (0)
+#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
@@ -311,8 +311,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
-#define audit_trim_trees() (void)0
-#define audit_put_tree(tree) (void)0
+#define audit_trim_trees() do { } while (0)
+#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
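
The audit.h hunk above replaces the empty-brace and (void)0 stubs with do { } while (0), the one stub form that parses as a single statement wherever a real call would. A minimal userspace sketch (hypothetical macro names, not from the patch) of why the brace form is unsafe:

#include <stdio.h>

#define audit_stub_bad(x) {}                  /* hypothetical: expands to a block */
#define audit_stub_good(x) do { } while (0)   /* hypothetical: expands to one statement */

int main(void)
{
	int enabled = 0;

	/*
	 * With the brace form, the ';' after the macro terminates the
	 * 'if' statement, so a following 'else' no longer compiles:
	 *
	 *	if (enabled)
	 *		audit_stub_bad(1);
	 *	else			<-- error: 'else' without a previous 'if'
	 *		puts("off");
	 */
	if (enabled)
		audit_stub_good(1);
	else
		puts("off");
	return 0;
}

The (void)0 stubs were legal but inconsistent; converting everything to do { } while (0) gives all the no-op macros identical statement semantics.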
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 5b3f01da172b..60739d5e3373 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
- return (void *)dentry; /* returning an error */
+ return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
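
ERR_CAST() above replaces an open-coded (void *) cast of an error pointer; the value is unchanged, but the helper documents that an errno encoded in one pointer type is being propagated as another. A simplified userspace model of the encoding (the real helpers live in include/linux/err.h):

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* error pointers occupy the top MAX_ERRNO addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
/* Propagate an error pointer of one type as another without losing it. */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

int main(void)
{
	void *dentry = ERR_PTR(-2 /* -ENOENT */);

	if (IS_ERR(dentry))
		printf("propagated error: %ld\n", PTR_ERR(ERR_CAST(dentry)));
	return 0;
}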
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 333b3bcfc545..db2c6b59dfc3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1359,7 +1359,8 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
+ security_task_getsecid_subj(current,
+ &sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 434337ab6b2b..175ef6f3ea4e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -667,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid(tsk, &sid);
+ security_task_getsecid_subj(tsk, &sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -805,8 +805,7 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
* (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
static void audit_filter_syscall(struct task_struct *tsk,
- struct audit_context *ctx,
- struct list_head *list)
+ struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
@@ -815,7 +814,7 @@ static void audit_filter_syscall(struct task_struct *tsk,
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
@@ -1627,8 +1626,7 @@ void __audit_free(struct task_struct *tsk)
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
- audit_filter_syscall(tsk, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_syscall(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit();
@@ -1735,8 +1733,7 @@ void __audit_syscall_exit(int success, long return_code)
else
context->return_code = return_code;
- audit_filter_syscall(current, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit();
@@ -1930,7 +1927,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(dentry, &caps);
+ rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
if (rc)
return rc;
@@ -2400,7 +2397,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &context->target_sid);
+ security_task_getsecid_obj(t, &context->target_sid);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2427,7 +2424,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &ctx->target_sid);
+ security_task_getsecid_obj(t, &ctx->target_sid);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2448,7 +2445,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
@@ -2481,7 +2478,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&init_user_ns,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 6639640523c0..b58b2efb9b43 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
- return NULL;
+ return ERR_PTR(-EBADF);
sdata = inode_storage_lookup(f->f_inode, map, true);
fput(f);
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 1622a44d1617..0ff58259ccf8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -209,7 +209,8 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid)
+BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 1a666a975416..70f6fd4fa305 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(image,
+ err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
&st_ops->func_models[i], 0,
tprogs, NULL);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2efeb5f4b343..b1a76fe046cb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4321,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0ae015ad1e05..75244ecb2389 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
@@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
{
atomic_long_sub(pages, &bpf_jit_current);
}
@@ -1118,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
* clone is guaranteed to not be locked.
*/
fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
__bpf_prog_free(fp);
}
@@ -2342,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
* analysis code and wants explicit zero extension inserted by verifier.
* Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
*/
bool __weak bpf_jit_needs_zext(void)
{
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..faa54d58972c 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -84,7 +84,7 @@ static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
[BPF_OR >> 4] = "or",
- [BPF_XOR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
};
static const char *const bpf_ldst_string[] = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dd4b7fd60ee7..d2de2abec35b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
return inode;
}
@@ -152,7 +152,8 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -381,8 +382,8 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct inode *dir, struct dentry *dentry,
- const char *target)
+static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
struct inode *inode;
@@ -507,7 +508,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, ACC_MODE(flags));
+ ret = path_permission(&path, ACC_MODE(flags));
if (ret)
goto out;
@@ -542,11 +543,11 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
- ret = bpf_prog_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else if (type == BPF_TYPE_LINK)
- ret = bpf_link_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
else
return -ENOENT;
@@ -558,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ);
+ int ret = inode_permission(&init_user_ns, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 79c5772465f1..53736e52c1df 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -60,9 +60,12 @@ static int finish(void)
&magic, sizeof(magic), &pos);
if (n != sizeof(magic))
return -EPIPE;
+
tgid = umd_ops.info.tgid;
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_ops.info.tgid = NULL;
+ if (tgid) {
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
return 0;
}
@@ -80,10 +83,18 @@ static int __init load_umd(void)
static void __exit fini_umd(void)
{
+ struct pid *tgid;
+
bpf_preload_ops = NULL;
+
/* kill UMD in case it's still there due to earlier error */
- kill_pid(umd_ops.info.tgid, SIGKILL, 1);
- umd_ops.info.tgid = NULL;
+ tgid = umd_ops.info.tgid;
+ if (tgid) {
+ kill_pid(tgid, SIGKILL, 1);
+
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
umd_unload_blob(&umd_ops.info);
}
late_initcall(load_umd);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be35bfb7fb13..6fbc2abe9c91 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -517,9 +517,17 @@ const struct bpf_func_proto bpf_get_stack_proto = {
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
- struct pt_regs *regs = task_pt_regs(task);
+ struct pt_regs *regs;
+ long res;
- return __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (!try_get_task_stack(task))
+ return -EFAULT;
+
+ regs = task_pt_regs(task);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ put_task_stack(task);
+
+ return res;
}
BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
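
The stackmap fix pins the task's stack with try_get_task_stack() before task_pt_regs() touches it, since an exiting task can free its stack underneath the reader. A userspace-only sketch of that try-acquire/use/release shape (the type and helpers here are stand-ins, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical stand-in for the task stack's usage count. */
struct stack_ref {
	atomic_int users;
};

/* Succeeds only while at least one reference is still live,
 * mirroring the shape of try_get_task_stack(). */
static bool try_get_stack(struct stack_ref *r)
{
	int old = atomic_load(&r->users);

	while (old > 0)
		if (atomic_compare_exchange_weak(&r->users, &old, old + 1))
			return true;
	return false;
}

static void put_stack(struct stack_ref *r)
{
	atomic_fetch_sub(&r->users, 1);	/* the real code frees the stack at zero */
}

static long read_regs_safely(struct stack_ref *r)
{
	long res;

	if (!try_get_stack(r))
		return -14;	/* -EFAULT: the stack is already gone */
	res = 0;		/* ...task_pt_regs() + __bpf_get_stack() would run here... */
	put_stack(r);
	return res;
}

int main(void)
{
	struct stack_ref live = { 1 }, dead = { 0 };

	return read_regs_safely(&live) == 0 &&
	       read_regs_safely(&dead) == -14 ? 0 : 1;
}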
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c859bc46d06c..250503482cda 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr)
err = PTR_ERR(btf);
goto free_map;
}
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EACCES;
+ goto free_map;
+ }
map->btf = btf;
if (attr->btf_value_type_id) {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7bc3b3209224..4aa8b52adf25 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,6 +9,7 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
+#include <linux/module.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -57,19 +58,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
- struct bpf_ksym *ksym = &tr->ksym;
-
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
- bpf_image_ksym_add(tr->image, ksym);
-}
-
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
- void *image;
int i;
mutex_lock(&trampoline_mutex);
@@ -84,14 +76,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
if (!tr)
goto out;
- /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec_page();
- if (!image) {
- kfree(tr);
- tr = NULL;
- goto out;
- }
-
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
hlist_add_head(&tr->hlist, head);
@@ -99,14 +83,31 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
- tr->image = image;
- INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
- bpf_trampoline_ksym_add(tr);
out:
mutex_unlock(&trampoline_mutex);
return tr;
}
+static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
+{
+ struct module *mod;
+ int err = 0;
+
+ preempt_disable();
+ mod = __module_text_address((unsigned long) tr->func.addr);
+ if (mod && !try_module_get(mod))
+ err = -ENOENT;
+ preempt_enable();
+ tr->mod = mod;
+ return err;
+}
+
+static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
+{
+ module_put(tr->mod);
+ tr->mod = NULL;
+}
+
static int is_ftrace_location(void *ip)
{
long addr;
@@ -128,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
ret = unregister_ftrace_direct((long)ip, (long)old_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+
+ if (!ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
@@ -154,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
return ret;
tr->func.ftrace_managed = ret;
+ if (bpf_trampoline_module_get(tr))
+ return -ENOENT;
+
if (tr->func.ftrace_managed)
ret = register_ftrace_direct((long)ip, (long)new_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+
+ if (ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
@@ -185,10 +195,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
return tprogs;
}
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_image_ksym_del(&im->ksym);
+ bpf_jit_free_exec(im->image);
+ bpf_jit_uncharge_modmem(1);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+ schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(pcref, struct bpf_tramp_image, pcref);
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ if (im->ip_after_call)
+ /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+ percpu_ref_kill(&im->pcref);
+ else
+ /* the case of fentry trampoline */
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+ /* The trampoline image that calls original function is using:
+ * rcu_read_lock_trace to protect sleepable bpf progs
+ * rcu_read_lock to protect normal bpf progs
+ * percpu_ref to protect trampoline itself
+ * rcu tasks to protect trampoline asm not covered by percpu_ref
+ * (which are few asm insns before __bpf_tramp_enter and
+ * after __bpf_tramp_exit)
+ *
+ * The trampoline is unreachable before bpf_tramp_image_put().
+ *
+ * First, patch the trampoline to avoid calling into fexit progs.
+ * The progs will be freed even if the original function is still
+ * executing or sleeping.
+ * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+ * first few asm instructions to execute and call into
+ * __bpf_tramp_enter->percpu_ref_get.
+ * Then use percpu_ref_kill to wait for the trampoline and the original
+ * function to finish.
+ * Then use call_rcu_tasks() to make sure few asm insns in
+ * the trampoline epilogue are done as well.
+ *
+ * In !PREEMPT case the task that got interrupted in the first asm
+ * insns won't go through an RCU quiescent state which the
+ * percpu_ref_kill will be waiting for. Hence the first
+ * call_rcu_tasks() is not necessary.
+ */
+ if (im->ip_after_call) {
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+ NULL, im->ip_epilogue);
+ WARN_ON(err);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+ else
+ percpu_ref_kill(&im->pcref);
+ return;
+ }
+
+ /* The trampoline without fexit and fmod_ret progs doesn't call original
+ * function and doesn't use percpu_ref.
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+ * and normal progs.
+ */
+ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_ksym *ksym;
+ void *image;
+ int err = -ENOMEM;
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ err = bpf_jit_charge_modmem(1);
+ if (err)
+ goto out_free_im;
+
+ err = -ENOMEM;
+ im->image = image = bpf_jit_alloc_exec_page();
+ if (!image)
+ goto out_uncharge;
+
+ err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+ if (err)
+ goto out_free_image;
+
+ ksym = &im->ksym;
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+ bpf_image_ksym_add(image, ksym);
+ return im;
+
+out_free_image:
+ bpf_jit_free_exec(im->image);
+out_uncharge:
+ bpf_jit_uncharge_modmem(1);
+out_free_im:
+ kfree(im);
+out:
+ return ERR_PTR(err);
+}
+
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
int err, total;
@@ -198,41 +340,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
return PTR_ERR(tprogs);
if (total == 0) {
- err = unregister_fentry(tr, old_image);
+ err = unregister_fentry(tr, tr->cur_image->image);
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
tr->selector = 0;
goto out;
}
+ im = bpf_tramp_image_alloc(tr->key, tr->selector);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- /* Though the second half of trampoline page is unused a task could be
- * preempted in the middle of the first half of trampoline and two
- * updates to trampoline would change the code from underneath the
- * preempted task. Hence wait for tasks to voluntarily schedule or go
- * to userspace.
- * The same trampoline can hold both sleepable and non-sleepable progs.
- * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
- * programs finish executing.
- * Wait for these two grace periods together.
- */
- synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
-
- err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);
if (err < 0)
goto out;
- if (tr->selector)
+ WARN_ON(tr->cur_image && tr->selector == 0);
+ WARN_ON(!tr->cur_image && tr->selector);
+ if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, old_image, new_image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image);
else
/* first time registering */
- err = register_fentry(tr, new_image);
+ err = register_fentry(tr, im->image);
if (err)
goto out;
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
tr->selector++;
out:
kfree(tprogs);
@@ -364,17 +507,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_image_ksym_del(&tr->ksym);
- /* This code will be executed when all bpf progs (both sleepable and
- * non-sleepable) went through
- * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
- * Hence no need for another synchronize_rcu_tasks_trace() here,
- * but synchronize_rcu_tasks() is still needed, since trampoline
- * may not have had any sleepable programs and we need to wait
- * for tasks to get out of trampoline code before freeing it.
+ /* This code will be executed even when the last bpf_tramp_image
+ * is alive. All progs are detached from the trampoline and the
+ * trampoline image is patched with jmp into epilogue to skip
+ * fexit progs. The fentry-only trampoline will be freed via
+ * multiple rcu callbacks.
*/
- synchronize_rcu_tasks();
- bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -478,8 +616,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
rcu_read_unlock_trace();
}
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+ percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+ percpu_ref_put(&tr->pcref);
+}
+
int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call)
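
Each trampoline generation now lives in its own bpf_tramp_image guarded by a percpu_ref: __bpf_tramp_enter()/__bpf_tramp_exit() bracket the window in which the image may still be executing, and teardown kills the ref so the release callback fires only after every in-flight call has drained. A kernel-style sketch of that lifecycle under the same API (hypothetical struct, not the patch itself):

#include <linux/percpu-refcount.h>

/* Hypothetical object guarded the same way as bpf_tramp_image. */
struct image {
	struct percpu_ref pcref;
};

static void image_release(struct percpu_ref *ref)
{
	/* Count reached zero: no call can still be inside the image,
	 * so this is where the real code schedules the free. */
}

static int image_init(struct image *im)
{
	/* Starts in per-CPU (fast) mode; image_release() runs once the
	 * initial reference and all enter/exit pairs have drained. */
	return percpu_ref_init(&im->pcref, image_release, 0, GFP_KERNEL);
}

/* Mirrors __bpf_tramp_enter() / __bpf_tramp_exit(). */
static void image_enter(struct image *im) { percpu_ref_get(&im->pcref); }
static void image_exit(struct image *im)  { percpu_ref_put(&im->pcref); }

static void image_teardown(struct image *im)
{
	/* Switch to atomic mode and drop the initial reference. */
	percpu_ref_kill(&im->pcref);
}

percpu_ref keeps get/put as per-CPU increments on the fast path, which is why the trampoline can afford to take a reference on every call into the original function.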
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1dda9d81f12c..0399ac092b36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -1120,7 +1127,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
reg->type = PTR_TO_RDWR_BUF;
break;
default:
- WARN_ON("unknown nullable register type");
+ WARN_ONCE(1, "unknown nullable register type");
}
}
@@ -1703,7 +1710,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (class == BPF_STX) {
- if (reg->type != SCALAR_VALUE)
+ /* BPF_STX (including atomic variants) has multiple source
+ * operands, one of which is a ptr. Check whether the caller is
+ * asking about it.
+ */
+ if (t == SRC_OP && reg->type != SCALAR_VALUE)
return true;
return BPF_SIZE(code) == BPF_DW;
}
@@ -1735,22 +1746,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
return true;
}
-/* Return TRUE if INSN doesn't have explicit value define. */
-static bool insn_no_def(struct bpf_insn *insn)
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
{
- u8 class = BPF_CLASS(insn->code);
-
- return (class == BPF_JMP || class == BPF_JMP32 ||
- class == BPF_STX || class == BPF_ST);
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH)) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else
+ return insn->src_reg;
+ } else {
+ return -1;
+ }
+ default:
+ return insn->dst_reg;
+ }
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- if (insn_no_def(insn))
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+ return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -5829,35 +5856,51 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
return &env->insn_aux_data[env->insn_idx];
}
+enum {
+ REASON_BOUNDS = -1,
+ REASON_TYPE = -2,
+ REASON_PATHS = -3,
+ REASON_LIMIT = -4,
+ REASON_STACK = -5,
+};
+
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
- u32 *ptr_limit, u8 opcode, bool off_is_neg)
+ const struct bpf_reg_state *off_reg,
+ u32 *alu_limit, u8 opcode)
{
+ bool off_is_neg = off_reg->smin_value < 0;
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off;
+ u32 max = 0, ptr_limit = 0;
+
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
switch (ptr_reg->type) {
case PTR_TO_STACK:
- /* Indirect variable offset stack access is prohibited in
- * unprivileged mode so it's not handled here.
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
*/
- off = ptr_reg->off + ptr_reg->var_off.value;
- if (mask_to_left)
- *ptr_limit = MAX_BPF_STACK + off;
- else
- *ptr_limit = -off;
- return 0;
+ max = MAX_BPF_STACK + mask_to_left;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
case PTR_TO_MAP_VALUE:
- if (mask_to_left) {
- *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off;
- }
- return 0;
+ max = ptr_reg->map_ptr->value_size;
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
+ break;
default:
- return -EINVAL;
+ return REASON_TYPE;
}
+
+ if (ptr_limit >= max)
+ return REASON_LIMIT;
+ *alu_limit = ptr_limit;
+ return 0;
}
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
@@ -5875,7 +5918,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
if (aux->alu_state &&
(aux->alu_state != alu_state ||
aux->alu_limit != alu_limit))
- return -EACCES;
+ return REASON_PATHS;
/* Corresponding fixup done in fixup_bpf_calls(). */
aux->alu_state = alu_state;
@@ -5894,19 +5937,28 @@ static int sanitize_val_alu(struct bpf_verifier_env *env,
return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
}
+static bool sanitize_needed(u8 opcode)
+{
+ return opcode == BPF_ADD || opcode == BPF_SUB;
+}
+
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
struct bpf_reg_state *dst_reg,
- bool off_is_neg)
+ struct bpf_insn_aux_data *tmp_aux,
+ const bool commit_window)
{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
bool ret;
+ int err;
if (can_skip_alu_sanitation(env, insn))
return 0;
@@ -5918,15 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;
- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
- alu_state |= ptr_is_dst_reg ?
- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
+ if (err < 0)
+ return err;
- if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
- return 0;
- if (update_alu_sanitation_state(aux, alu_state, alu_limit))
- return -EACCES;
+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = tmp_aux->alu_state;
+ alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ }
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ */
+ if (commit_window)
+ return 0;
+
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
* masking when off was not within expected range. If off
@@ -5943,7 +6013,46 @@ do_sim:
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
- return !ret ? -EFAULT : 0;
+ return !ret ? REASON_STACK : 0;
+}
+
+static int sanitize_err(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int reason,
+ const struct bpf_reg_state *off_reg,
+ const struct bpf_reg_state *dst_reg)
+{
+ static const char *err = "pointer arithmetic with it prohibited for !root";
+ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
+ u32 dst = insn->dst_reg, src = insn->src_reg;
+
+ switch (reason) {
+ case REASON_BOUNDS:
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
+ off_reg == dst_reg ? dst : src, err);
+ break;
+ case REASON_TYPE:
+ verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
+ off_reg == dst_reg ? src : dst, err);
+ break;
+ case REASON_PATHS:
+ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
+ dst, op, err);
+ break;
+ case REASON_LIMIT:
+ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
+ dst, op, err);
+ break;
+ case REASON_STACK:
+ verbose(env, "R%d could not be pushed for speculative verification, %s\n",
+ dst, err);
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reason (%d)\n",
+ reason);
+ break;
+ }
+
+ return -EACCES;
}
/* check that stack access falls within stack limits and that 'reg' doesn't
@@ -5980,6 +6089,37 @@ static int check_stack_access_for_ptr_arithmetic(
return 0;
}
+static int sanitize_check_bounds(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ const struct bpf_reg_state *dst_reg)
+{
+ u32 dst = insn->dst_reg;
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (env->bypass_spec_v1)
+ return 0;
+
+ switch (dst_reg->type) {
+ case PTR_TO_STACK:
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
+ dst_reg->off + dst_reg->var_off.value))
+ return -EACCES;
+ break;
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
@@ -5999,8 +6139,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
- u32 dst = insn->dst_reg, src = insn->src_reg;
+ struct bpf_insn_aux_data tmp_aux = {};
u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
int ret;
dst_reg = &regs[dst];
@@ -6048,13 +6189,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
- case PTR_TO_MAP_VALUE:
- if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
- verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
- off_reg == dst_reg ? dst : src);
- return -EACCES;
- }
- fallthrough;
default:
break;
}
@@ -6072,13 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &tmp_aux, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
switch (opcode) {
case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different maps or paths\n", dst);
- return ret;
- }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -6129,11 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps or paths\n", dst);
- return ret;
- }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -6214,21 +6345,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
- /* For unprivileged we require that resulting offset must be in bounds
- * in order to be able to sanitize access later on.
- */
- if (!env->bypass_spec_v1) {
- if (dst_reg->type == PTR_TO_MAP_VALUE &&
- check_map_access(env, dst, dst_reg->off, 1, false)) {
- verbose(env, "R%d pointer arithmetic of map value goes out of range, "
- "prohibited for !root\n", dst);
- return -EACCES;
- } else if (dst_reg->type == PTR_TO_STACK &&
- check_stack_access_for_ptr_arithmetic(
- env, dst, dst_reg, dst_reg->off +
- dst_reg->var_off.value)) {
- return -EACCES;
- }
+ if (sanitize_check_bounds(env, insn, dst_reg) < 0)
+ return -EACCES;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &tmp_aux, true);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
return 0;
@@ -6822,9 +6945,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
s32 s32_min_val, s32_max_val;
u32 u32_min_val, u32_max_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- u32 dst = insn->dst_reg;
- int ret;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ int ret;
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
@@ -6866,6 +6988,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
return 0;
}
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, NULL, NULL);
+ }
+
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
* There are two classes of instructions: The first class we track both
* alu32 and alu64 sign/unsigned bounds independently this provides the
@@ -6882,21 +7010,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
switch (opcode) {
case BPF_ADD:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_sub(dst_reg, &src_reg);
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
@@ -9029,6 +9147,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
env->prog->aux->btf = btf;
err = check_btf_func(env, attr, uattr);
@@ -11006,9 +11128,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
for (i = 0; i < len; i++) {
int adj_idx = i + delta;
struct bpf_insn insn;
- u8 load_reg;
+ int load_reg;
insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
if (!aux[adj_idx].zext_dst) {
u8 code, class;
u32 imm_rnd;
@@ -11018,14 +11141,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
code = insn.code;
class = BPF_CLASS(code);
- if (insn_no_def(&insn))
+ if (load_reg == -1)
continue;
/* NOTE: arg "reg" (the fourth one) is only used for
- * BPF_STX which has been ruled out in above
- * check, it is safe to pass NULL here.
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
*/
- if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -11040,31 +11163,28 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
imm_rnd = get_random_int();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
- rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ rnd_hi32_patch[3].dst_reg = load_reg;
patch = rnd_hi32_patch;
patch_len = 4;
goto apply_patch_buffer;
}
- if (!bpf_jit_needs_zext())
+ /* Add in a zero-extend instruction if a) the JIT has requested

+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
- /* zext_dst means that we want to zero-extend whatever register
- * the insn defines, which is dst_reg most of the time, with
- * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH.
- */
- if (BPF_CLASS(insn.code) == BPF_STX &&
- BPF_MODE(insn.code) == BPF_ATOMIC) {
- /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not
- * define any registers, therefore zext_dst cannot be
- * set.
- */
- if (WARN_ON(!(insn.imm & BPF_FETCH)))
- return -EINVAL;
- load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0
- : insn.src_reg;
- } else {
- load_reg = insn.dst_reg;
+ if (WARN_ON(load_reg == -1)) {
+ verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ return -EFAULT;
}
zext_patch[0] = insn;
@@ -11635,7 +11755,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
@@ -12120,6 +12240,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
u32 btf_id, member_idx;
const char *mname;
+ if (!prog->gpl_compatible) {
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
btf_id = prog->aux->attach_btf_id;
st_ops = bpf_struct_ops_find(btf_id);
if (!st_ops) {
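
The retrieve_ptr_limit() rework above computes a strict alu_limit, which is why fixup_bpf_calls() drops the old "- 1" when loading it into AX (the BPF_MOV32_IMM hunk); at run time the emitted mask sequence clamps the offset without a branch, so even a misspeculated pointer operation stays inside the object. A simplified model of that arithmetic, ignoring the negative-offset pre/post fixups the patch also emits:

#include <stdint.h>
#include <stdio.h>

/* Branchless clamp with the same shape as the emitted BPF sequence:
 * returns off when 0 <= off <= limit, else 0. Assumes '>>' on a
 * negative value is an arithmetic shift (true of mainstream compilers,
 * and guaranteed for the BPF_ARSH instruction the verifier emits). */
static int64_t sanitize_off(int64_t off, int64_t limit)
{
	int64_t ax = limit;	/* BPF_MOV32_IMM(AX, alu_limit)    */

	ax -= off;		/* negative iff off > limit        */
	ax |= off;		/* negative also iff off < 0       */
	ax = -ax;		/* sign bit clear iff off was bad  */
	ax >>= 63;		/* all-ones if ok, all-zero if not */
	return ax & off;
}

int main(void)
{
	printf("%lld %lld %lld\n",
	       (long long)sanitize_off(16, 32),	/* in range  -> 16 */
	       (long long)sanitize_off(64, 32),	/* too large -> 0  */
	       (long long)sanitize_off(-8, 32));	/* negative  -> 0  */
	return 0;
}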
diff --git a/kernel/capability.c b/kernel/capability.c
index de7eac903a2a..46a361dde042 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -484,10 +484,12 @@ EXPORT_SYMBOL(file_ns_capable);
*
* Return true if the inode uid and gid are within the namespace.
*/
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+ struct user_namespace *mnt_userns,
+ const struct inode *inode)
{
- return kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
+ kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
}
/**
@@ -499,11 +501,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+ const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+ return ns_capable(ns, cap) &&
+ privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..e17a56639766
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ *
+ * Copyright (C) 2021 Google LLC
+ */
+
+#include <linux/hardirq.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+
+/* Compiler-defined handler names */
+#ifdef CONFIG_CFI_PERMISSIVE
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail
+#else
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
+#endif
+
+static inline void handle_cfi_failure(void *ptr)
+{
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+ WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ else
+ panic("CFI failure (target: %pS)\n", ptr);
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_CFI_CLANG_SHADOW
+/*
+ * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
+ * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
+ */
+typedef u16 shadow_t;
+#define SHADOW_INVALID ((shadow_t)~0UL)
+
+struct cfi_shadow {
+ /* Page index for the beginning of the shadow */
+ unsigned long base;
+ /* An array of __cfi_check locations (as indices to the shadow) */
+ shadow_t shadow[1];
+} __packed;
+
+/*
+ * The shadow covers ~128M from the beginning of the module region. If
+ * the region is larger, we fall back to __module_address for the rest.
+ */
+#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
+
+/* The in-memory size of struct cfi_shadow, always at least one page */
+#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
+#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
+#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
+
+/* The actual size of the shadow array, minus metadata */
+#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
+#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
+
+static DEFINE_MUTEX(shadow_update_lock);
+static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
+
+/* Returns the index in the shadow for the given address */
+static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
+{
+ unsigned long index;
+ unsigned long page = ptr >> PAGE_SHIFT;
+
+ if (unlikely(page < s->base))
+ return -1; /* Outside of module area */
+
+ index = page - s->base;
+
+ if (index >= SHADOW_ARR_SLOTS)
+ return -1; /* Cannot be addressed with shadow */
+
+ return (int)index;
+}
+
+/* Returns the page address for an index in the shadow */
+static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
+ int index)
+{
+ if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+ return 0;
+
+ return (s->base + index) << PAGE_SHIFT;
+}
+
+/* Returns the __cfi_check function address for the given shadow location */
+static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
+ int index)
+{
+ if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+ return 0;
+
+ if (unlikely(s->shadow[index] == SHADOW_INVALID))
+ return 0;
+
+ /* __cfi_check is always page aligned */
+ return (s->base + s->shadow[index]) << PAGE_SHIFT;
+}
+
+static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
+ struct cfi_shadow *next)
+{
+ int i, index, check;
+
+ /* Mark everything invalid */
+ memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
+
+ if (!prev)
+ return; /* No previous shadow */
+
+ /* If the base address didn't change, an update is not needed */
+ if (prev->base == next->base) {
+ memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
+ return;
+ }
+
+ /* Convert the previous shadow to the new address range */
+ for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
+ if (prev->shadow[i] == SHADOW_INVALID)
+ continue;
+
+ index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
+ if (index < 0)
+ continue;
+
+ check = ptr_to_shadow(next,
+ shadow_to_check_fn(prev, prev->shadow[i]));
+ if (check < 0)
+ continue;
+
+ next->shadow[index] = (shadow_t)check;
+ }
+}
+
+static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
+ unsigned long min_addr, unsigned long max_addr)
+{
+ int check_index;
+ unsigned long check = (unsigned long)mod->cfi_check;
+ unsigned long ptr;
+
+ if (unlikely(!PAGE_ALIGNED(check))) {
+ pr_warn("cfi: not using shadow for module %s\n", mod->name);
+ return;
+ }
+
+ check_index = ptr_to_shadow(s, check);
+ if (check_index < 0)
+ return; /* Module not addressable with shadow */
+
+ /* For each page, store the check function index in the shadow */
+ for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+
+ if (index >= 0) {
+ /* Each page must only contain one module */
+ WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
+ s->shadow[index] = (shadow_t)check_index;
+ }
+ }
+}
+
+static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
+ unsigned long min_addr, unsigned long max_addr)
+{
+ unsigned long ptr;
+
+ for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+
+ if (index >= 0)
+ s->shadow[index] = SHADOW_INVALID;
+ }
+}
+
+typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
+ unsigned long min_addr, unsigned long max_addr);
+
+static void update_shadow(struct module *mod, unsigned long base_addr,
+ update_shadow_fn fn)
+{
+ struct cfi_shadow *prev;
+ struct cfi_shadow *next;
+ unsigned long min_addr, max_addr;
+
+ next = vmalloc(SHADOW_SIZE);
+
+ mutex_lock(&shadow_update_lock);
+ prev = rcu_dereference_protected(cfi_shadow,
+ mutex_is_locked(&shadow_update_lock));
+
+ if (next) {
+ next->base = base_addr >> PAGE_SHIFT;
+ prepare_next_shadow(prev, next);
+
+ min_addr = (unsigned long)mod->core_layout.base;
+ max_addr = min_addr + mod->core_layout.text_size;
+ fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
+
+ set_memory_ro((unsigned long)next, SHADOW_PAGES);
+ }
+
+ rcu_assign_pointer(cfi_shadow, next);
+ mutex_unlock(&shadow_update_lock);
+ synchronize_rcu();
+
+ if (prev) {
+ set_memory_rw((unsigned long)prev, SHADOW_PAGES);
+ vfree(prev);
+ }
+}
+
+void cfi_module_add(struct module *mod, unsigned long base_addr)
+{
+ update_shadow(mod, base_addr, add_module_to_shadow);
+}
+
+void cfi_module_remove(struct module *mod, unsigned long base_addr)
+{
+ update_shadow(mod, base_addr, remove_module_from_shadow);
+}
+
+static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
+ unsigned long ptr)
+{
+ int index;
+
+ if (unlikely(!s))
+ return NULL; /* No shadow available */
+
+ index = ptr_to_shadow(s, ptr);
+ if (index < 0)
+ return NULL; /* Cannot be addressed with shadow */
+
+ return (cfi_check_fn)shadow_to_check_fn(s, index);
+}
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn;
+
+ rcu_read_lock_sched();
+ fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
+ rcu_read_unlock_sched();
+
+ return fn;
+}
+
+#else /* !CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn = NULL;
+ struct module *mod;
+
+ rcu_read_lock_sched();
+ mod = __module_address(ptr);
+ if (mod)
+ fn = mod->cfi_check;
+ rcu_read_unlock_sched();
+
+ return fn;
+}
+
+static inline cfi_check_fn find_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn = NULL;
+
+ if (is_kernel_text(ptr))
+ return __cfi_check;
+
+ /*
+ * Indirect call checks can happen when RCU is not watching. Both
+ * the shadow and __module_address use RCU, so we need to wake it
+ * up if necessary.
+ */
+ RCU_NONIDLE({
+ if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
+ fn = find_shadow_check_fn(ptr);
+
+ if (!fn)
+ fn = find_module_check_fn(ptr);
+ });
+
+ return fn;
+}
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+ cfi_check_fn fn = find_check_fn((unsigned long)ptr);
+
+ if (likely(fn))
+ fn(id, ptr, diag);
+ else /* Don't allow unchecked modules */
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#else /* !CONFIG_MODULES */
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+ handle_cfi_failure(ptr); /* No modules */
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#endif /* CONFIG_MODULES */
+
+void cfi_failure_handler(void *data, void *ptr, void *vtable)
+{
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(cfi_failure_handler);
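Taken together, the slowpath resolves a check function in a fixed order: kernel text gets the built-in __cfi_check, then the shadow fast path, then the module list. An unresolvable target is reported as a CFI failure rather than being allowed through. For orientation, a cross-module indirect call instrumented by Clang lowers to roughly the shape below; the type id constant is hypothetical (the real one is a hash of the function type), and the stub only stands in for the kernel entry point.

#include <stdint.h>
#include <stdio.h>

/* Stub standing in for the kernel's resolve-and-check entry point. */
static void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
{
    (void)diag;
    printf("checking call to %p against type id 0x%llx\n",
           ptr, (unsigned long long)id);
}

static void greet(int n) { printf("hello %d\n", n); }

/* Roughly what an instrumented indirect call site expands to.
 * (Casting a function pointer to void * is non-standard but works
 * on the platforms this sketch targets.) */
static void call_handler(void (*handler)(int), int arg)
{
    __cfi_slowpath_diag(0x3d6a4f0cULL, (void *)handler, NULL);
    handler(arg);
}

int main(void) { call_handler(greet, 1); return 0; }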
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1ea995f801ec..9153b20e5cc6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4672,7 +4672,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
iput(inode);
return ret;
}
@@ -4728,8 +4728,8 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
return ret;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
{
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
@@ -4740,7 +4740,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4750,19 +4750,26 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
+ /* process and thread migrations follow the same delegation rule */
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, true);
+ of->file->f_path.dentry->d_sb, threadgroup);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@ -4773,41 +4780,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, false);
- if (ret)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
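The duplicated bodies of cgroup_procs_write() and cgroup_threads_write() collapse into __cgroup_procs_write(), parameterized on threadgroup, and only the kernfs return convention (bytes consumed on success, negative errno on failure) stays in the wrappers via GCC's conditional with an omitted middle operand. A minimal illustration of that ret ?: nbytes idiom, with placeholder values:

/* GNU extension: "a ?: b" evaluates a once and yields it if nonzero,
 * else b. Here: propagate an error, otherwise report bytes consumed. */
#include <stdio.h>

static long write_op(int fail, long nbytes)
{
    long ret = fail ? -22 /* -EINVAL */ : 0;

    return ret ?: nbytes;
}

int main(void)
{
    printf("%ld %ld\n", write_op(0, 64), write_op(1, 64)); /* 64 -22 */
    return 0;
}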
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 53c70c470a38..5258b68153e0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -98,7 +98,7 @@ struct cpuset {
* and if it ends up empty, it will inherit the parent's mask.
*
*
- * On legacy hierachy:
+ * On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
@@ -1309,10 +1309,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
*
- * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
+ * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index af6e8b4fb359..4708aec492df 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -119,7 +119,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
*/
static atomic_t masters_in_kgdb;
static atomic_t slaves_in_kgdb;
-static atomic_t kgdb_break_tasklet_var;
atomic_t kgdb_setting_breakpoint;
struct task_struct *kgdb_usethread;
@@ -456,6 +455,17 @@ setundefined:
return 0;
}
+void kgdb_free_init_mem(void)
+{
+ int i;
+
+ /* Clear init memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+}
+
#ifdef CONFIG_KGDB_KDB
void kdb_dump_stack_on_cpu(int cpu)
{
@@ -1084,31 +1094,6 @@ static void kgdb_unregister_callbacks(void)
}
}
-/*
- * There are times a tasklet needs to be used vs a compiled in
- * break point so as to cause an exception outside a kgdb I/O module,
- * such as is the case with kgdboe, where calling a breakpoint in the
- * I/O driver itself would be fatal.
- */
-static void kgdb_tasklet_bpt(unsigned long ing)
-{
- kgdb_breakpoint();
- atomic_set(&kgdb_break_tasklet_var, 0);
-}
-
-static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
-
-void kgdb_schedule_breakpoint(void)
-{
- if (atomic_read(&kgdb_break_tasklet_var) ||
- atomic_read(&kgdb_active) != -1 ||
- atomic_read(&kgdb_setting_breakpoint))
- return;
- atomic_inc(&kgdb_break_tasklet_var);
- tasklet_schedule(&kgdb_tasklet_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -1166,7 +1151,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
EXPORT_SYMBOL_GPL(kgdb_register_io_module);
/**
- * kkgdb_unregister_io_module - unregister KGDB IO module
+ * kgdb_unregister_io_module - unregister KGDB IO module
* @old_dbg_io_ops: the io ops vector
*
* Unregister it with the KGDB core.
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a77df59d9ca5..8372897402f4 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -321,7 +321,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
/*
* Copy the binary array pointed to by buf into mem. Fix $, #, and
* 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * The input buf is overwitten with the result to write to mem.
+ * The input buf is overwritten with the result to write to mem.
*/
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
@@ -595,7 +595,7 @@ static char *gdb_hex_reg_helper(int regnum, char *out)
dbg_reg_def[i].size);
}
-/* Handle the 'p' individual regster get */
+/* Handle the 'p' individual register get */
static void gdb_cmd_reg_get(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -610,7 +610,7 @@ static void gdb_cmd_reg_get(struct kgdb_state *ks)
gdb_hex_reg_helper(regnum, remcom_out_buffer);
}
-/* Handle the 'P' individual regster set */
+/* Handle the 'P' individual register set */
static void gdb_cmd_reg_set(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -952,7 +952,7 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
}
/*
- * This function performs all gdbserial command procesing
+ * This function performs all gdbserial command processing
*/
int gdb_serial_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index ec4940146612..2168f8dacb99 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -522,6 +522,54 @@ static int kdb_ss(int argc, const char **argv)
return KDB_CMD_SS;
}
+static kdbtab_t bptab[] = {
+ { .cmd_name = "bp",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Set/Display breakpoints",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "bl",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Display breakpoints",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "bc",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Clear Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "be",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Enable Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "bd",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Disable Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "ss",
+ .cmd_func = kdb_ss,
+ .cmd_usage = "",
+ .cmd_help = "Single Step",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+};
+
+static kdbtab_t bphcmd = {
+ .cmd_name = "bph",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "[datar [length]|dataw [length]] Set hw brk",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+};
+
/* Initialize the breakpoint table and register breakpoint commands. */
void __init kdb_initbptab(void)
@@ -537,30 +585,7 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_table(bptab, ARRAY_SIZE(bptab));
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
-
- kdb_register_flags("ss", kdb_ss, "",
- "Single Step", 1,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- /*
- * Architecture dependent initialization.
- */
+ kdb_register_table(&bphcmd, 1);
}
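Per-command kdb_register_flags() calls become one designated-initializer table plus a single kdb_register_table() call, with the conditional "bph" entry kept as its own one-element table behind the KGDB_HW_BREAKPOINT check. The standalone sketch below models the registration pattern only; the struct layout and registry are simplified stand-ins, not the kdb types.

/* Simplified model of table-driven command registration. */
#include <stdio.h>
#include <stddef.h>

struct cmd {
    const char *name;
    const char *help;
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static void register_table(const struct cmd *tab, size_t len)
{
    while (len--) {
        printf("registered %-6s %s\n", tab->name, tab->help);
        tab++;
    }
}

static const struct cmd bptab[] = {
    { .name = "bp", .help = "Set/Display breakpoints" },
    { .name = "bc", .help = "Clear Breakpoint" },
};

static const struct cmd bphcmd = {
    .name = "bph", .help = "Set hw brk",
};

int main(void)
{
    int have_hw_breakpoints = 1;        /* stands in for arch_kgdb_ops */

    register_table(bptab, ARRAY_SIZE(bptab));
    if (have_hw_breakpoints)
        register_table(&bphcmd, 1);     /* single-entry "table" */
    return 0;
}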
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 930ac1b25ec7..1baa96a2ecb8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -33,6 +33,7 @@
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
@@ -84,15 +85,8 @@ static unsigned int kdb_continue_catastrophic =
static unsigned int kdb_continue_catastrophic;
#endif
-/* kdb_commands describes the available commands. */
-static kdbtab_t *kdb_commands;
-#define KDB_BASE_CMD_MAX 50
-static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
-#define for_each_kdbcmd(cmd, num) \
- for ((cmd) = kdb_base_commands, (num) = 0; \
- num < kdb_max_commands; \
- num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+/* kdb_cmds_head describes the available commands. */
+static LIST_HEAD(kdb_cmds_head);
typedef struct _kdbmsg {
int km_diag; /* kdb diagnostic */
@@ -146,42 +140,18 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
* KDB_ENVBUFSIZE if required).
*/
-static char *__env[] = {
+static char *__env[31] = {
#if defined(CONFIG_SMP)
- "PROMPT=[%d]kdb> ",
+ "PROMPT=[%d]kdb> ",
#else
- "PROMPT=kdb> ",
+ "PROMPT=kdb> ",
#endif
- "MOREPROMPT=more> ",
- "RADIX=16",
- "MDCOUNT=8", /* lines of md output */
- KDB_PLATFORM_ENV,
- "DTABCOUNT=30",
- "NOSECT=1",
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
};
static const int __nenv = ARRAY_SIZE(__env);
@@ -324,6 +294,63 @@ int kdbgetintenv(const char *match, int *value)
}
/*
+ * kdb_setenv() - Alter an existing environment variable or create a new one.
+ * @var: Name of the variable
+ * @val: Value of the variable
+ *
+ * Return: Zero on success, a kdb diagnostic on failure.
+ */
+static int kdb_setenv(const char *var, const char *val)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ varlen = strlen(var);
+ vallen = strlen(val);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", var, val);
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], var, varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't an existing variable. Fit it into a free slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+/*
+ * kdb_printenv() - Display the current environment variables.
+ */
+static void kdb_printenv(void)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+}
+
+/*
* kdbgetularg - This function will convert a numeric string into an
* unsigned long value.
* Parameters:
@@ -380,10 +407,6 @@ int kdbgetu64arg(const char *arg, u64 *value)
*/
int kdb_set(int argc, const char **argv)
{
- int i;
- char *ep;
- size_t varlen, vallen;
-
/*
* we can be invoked two ways:
* set var=value argv[1]="var", argv[2]="value"
@@ -428,37 +451,7 @@ int kdb_set(int argc, const char **argv)
* Tokenizer squashed the '=' sign. argv[1] is variable
* name, argv[2] = value.
*/
- varlen = strlen(argv[1]);
- vallen = strlen(argv[2]);
- ep = kdballocenv(varlen + vallen + 2);
- if (ep == (char *)0)
- return KDB_ENVBUFFULL;
-
- sprintf(ep, "%s=%s", argv[1], argv[2]);
-
- ep[varlen+vallen+1] = '\0';
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i]
- && ((strncmp(__env[i], argv[1], varlen) == 0)
- && ((__env[i][varlen] == '\0')
- || (__env[i][varlen] == '=')))) {
- __env[i] = ep;
- return 0;
- }
- }
-
- /*
- * Wasn't existing variable. Fit into slot.
- */
- for (i = 0; i < __nenv-1; i++) {
- if (__env[i] == (char *)0) {
- __env[i] = ep;
- return 0;
- }
- }
-
- return KDB_ENVFULL;
+ return kdb_setenv(argv[1], argv[2]);
}
static int kdb_check_regs(void)
@@ -921,7 +914,7 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep = 0;
+ int escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
@@ -1011,25 +1004,17 @@ int kdb_parse(const char *cmdstr)
++argv[0];
}
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- /*
- * If this command is allowed to be abbreviated,
- * check to see if this is it.
- */
-
- if (tp->cmd_minlen
- && (strlen(argv[0]) <= tp->cmd_minlen)) {
- if (strncmp(argv[0],
- tp->cmd_name,
- tp->cmd_minlen) == 0) {
- break;
- }
- }
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+ if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) &&
+ (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0))
+ break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
- break;
- }
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
}
/*
@@ -1037,19 +1022,15 @@ int kdb_parse(const char *cmdstr)
* few characters of this match any of the known commands.
* e.g., md1c20 should match md.
*/
- if (i == kdb_max_commands) {
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- if (strncmp(argv[0],
- tp->cmd_name,
- strlen(tp->cmd_name)) == 0) {
- break;
- }
- }
+ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ if (strncmp(argv[0], tp->cmd_name,
+ strlen(tp->cmd_name)) == 0)
+ break;
}
}
- if (i < kdb_max_commands) {
+ if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
@@ -2073,12 +2054,7 @@ static int kdb_lsmod(int argc, const char **argv)
static int kdb_env(int argc, const char **argv)
{
- int i;
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i])
- kdb_printf("%s\n", __env[i]);
- }
+ kdb_printenv();
if (KDB_DEBUG(MASK))
kdb_printf("KDBDEBUG=0x%x\n",
@@ -2101,7 +2077,7 @@ static int kdb_dmesg(int argc, const char **argv)
int adjust = 0;
int n = 0;
int skip = 0;
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dump_iter iter;
size_t len;
char buf[201];
@@ -2126,8 +2102,8 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2159,8 +2135,8 @@ static int kdb_dmesg(int argc, const char **argv)
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
@@ -2428,17 +2404,14 @@ static int kdb_kgdb(int argc, const char **argv)
static int kdb_help(int argc, const char **argv)
{
kdbtab_t *kt;
- int i;
kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
kdb_printf("-----------------------------"
"-----------------------------\n");
- for_each_kdbcmd(kt, i) {
+ list_for_each_entry(kt, &kdb_cmds_head, list_node) {
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kt->cmd_name)
- continue;
if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
continue;
if (strlen(kt->cmd_usage) > 20)
@@ -2659,7 +2632,6 @@ static int kdb_grep_help(int argc, const char **argv)
* Returns:
* zero for success, one if a duplicate command.
*/
-#define kdb_command_extend 50 /* arbitrary */
int kdb_register_flags(char *cmd,
kdb_func_t func,
char *usage,
@@ -2667,49 +2639,20 @@ int kdb_register_flags(char *cmd,
short minlen,
kdb_cmdflags_t flags)
{
- int i;
kdbtab_t *kp;
- /*
- * Brute force method to determine duplicates
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->cmd_name, cmd) == 0) {
kdb_printf("Duplicate kdb command registered: "
"%s, func %px help %s\n", cmd, func, help);
return 1;
}
}
- /*
- * Insert command into first available location in table
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name == NULL)
- break;
- }
-
- if (i >= kdb_max_commands) {
- kdbtab_t *new = kmalloc_array(kdb_max_commands -
- KDB_BASE_CMD_MAX +
- kdb_command_extend,
- sizeof(*new),
- GFP_KDB);
- if (!new) {
- kdb_printf("Could not allocate new kdb_command "
- "table\n");
- return 1;
- }
- if (kdb_commands) {
- memcpy(new, kdb_commands,
- (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
- kfree(kdb_commands);
- }
- memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
- kdb_command_extend * sizeof(*new));
- kdb_commands = new;
- kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
- kdb_max_commands += kdb_command_extend;
+ kp = kmalloc(sizeof(*kp), GFP_KDB);
+ if (!kp) {
+ kdb_printf("Could not allocate new kdb_command table\n");
+ return 1;
}
kp->cmd_name = cmd;
@@ -2718,11 +2661,27 @@ int kdb_register_flags(char *cmd,
kp->cmd_help = help;
kp->cmd_minlen = minlen;
kp->cmd_flags = flags;
+ kp->is_dynamic = true;
+
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
return 0;
}
EXPORT_SYMBOL_GPL(kdb_register_flags);
+/*
+ * kdb_register_table() - Register a kdb command table.
+ * @kp: pointer to kdb command table
+ * @len: length of kdb command table
+ */
+void kdb_register_table(kdbtab_t *kp, size_t len)
+{
+ while (len--) {
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
+ kp++;
+ }
+}
/*
* kdb_register - Compatibility register function for commands that do
@@ -2757,15 +2716,16 @@ EXPORT_SYMBOL_GPL(kdb_register);
*/
int kdb_unregister(char *cmd)
{
- int i;
kdbtab_t *kp;
/*
* find the command.
*/
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kp->cmd_name = NULL;
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->cmd_name, cmd) == 0) {
+ list_del(&kp->list_node);
+ if (kp->is_dynamic)
+ kfree(kp);
return 0;
}
}
@@ -2775,118 +2735,222 @@ int kdb_unregister(char *cmd)
}
EXPORT_SYMBOL_GPL(kdb_unregister);
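With the fixed command array gone, the registry is a plain doubly linked list: built-in tables are linked in place, while kdb_register_flags() kmalloc()s a node and sets is_dynamic so kdb_unregister() knows whether unlinking must be followed by kfree(). A userspace model of that ownership rule, with a minimal local intrusive list standing in for the kernel's list_head machinery:

/* Mixed static/dynamic command list: static entries are linked in
 * place, dynamic ones are freed on unregister. */
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct node { struct node *prev, *next; };

struct cmd {
    const char *name;
    struct node list;
    bool is_dynamic;
};

static struct node cmds = { &cmds, &cmds };   /* empty circular list */

static void list_add_tail(struct node *n, struct node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

static void register_cmd_dynamic(const char *name)
{
    struct cmd *c = malloc(sizeof(*c));

    if (!c)
        return;
    c->name = name;
    c->is_dynamic = true;
    list_add_tail(&c->list, &cmds);
}

static void unregister_cmd(const char *name)
{
    struct node *n;

    for (n = cmds.next; n != &cmds; n = n->next) {
        /* Open-coded container_of(). */
        struct cmd *c = (struct cmd *)((char *)n - offsetof(struct cmd, list));

        if (strcmp(c->name, name))
            continue;
        n->prev->next = n->next;          /* list_del() */
        n->next->prev = n->prev;
        printf("unregistered %s (%s)\n", c->name,
               c->is_dynamic ? "freed" : "static");
        if (c->is_dynamic)                /* only free what we allocated */
            free(c);
        return;
    }
}

int main(void)
{
    static struct cmd help = { .name = "help" }; /* static: never freed */

    list_add_tail(&help.list, &cmds);
    register_cmd_dynamic("mycmd");
    unregister_cmd("mycmd");
    unregister_cmd("help");
    return 0;
}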
-/* Initialize the kdb command table. */
-static void __init kdb_inittab(void)
-{
- int i;
- kdbtab_t *kp;
-
- for_each_kdbcmd(kp, i)
- kp->cmd_name = NULL;
-
- kdb_register_flags("md", kdb_md, "<vaddr>",
- "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0,
- KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1,
- KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("rd", kdb_rd, "",
- "Display Registers", 0,
- KDB_ENABLE_REG_READ);
- kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0,
- KDB_ENABLE_REG_WRITE);
- kdb_register_flags("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btt", kdb_bt, "<vaddr>",
- "Backtrace process given its struct task address", 0,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("env", kdb_env, "",
- "Show environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("set", kdb_set, "",
- "Set environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("help", kdb_help, "",
- "Display Help Message", 1,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("?", kdb_help, "",
- "Display Help Message", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0,
- KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, 0);
- kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0,
- KDB_ENABLE_REBOOT);
+static kdbtab_t maintab[] = {
+ { .cmd_name = "md",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mdr",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr> <bytes>",
+ .cmd_help = "Display Raw Memory",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mdp",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<paddr> <bytes>",
+ .cmd_help = "Display Physical Memory",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mds",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display Memory Symbolically",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mm",
+ .cmd_func = kdb_mm,
+ .cmd_usage = "<vaddr> <contents>",
+ .cmd_help = "Modify Memory Contents",
+ .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "go",
+ .cmd_func = kdb_go,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Continue Execution",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .cmd_name = "rd",
+ .cmd_func = kdb_rd,
+ .cmd_usage = "",
+ .cmd_help = "Display Registers",
+ .cmd_flags = KDB_ENABLE_REG_READ,
+ },
+ { .cmd_name = "rm",
+ .cmd_func = kdb_rm,
+ .cmd_usage = "<reg> <contents>",
+ .cmd_help = "Modify Registers",
+ .cmd_flags = KDB_ENABLE_REG_WRITE,
+ },
+ { .cmd_name = "ef",
+ .cmd_func = kdb_ef,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display exception frame",
+ .cmd_flags = KDB_ENABLE_MEM_READ,
+ },
+ { .cmd_name = "bt",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Stack traceback",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .cmd_name = "btp",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "<pid>",
+ .cmd_help = "Display stack for process <pid>",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "bta",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
+ .cmd_help = "Backtrace all processes matching state flag",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "btc",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "",
+ .cmd_help = "Backtrace current process on each cpu",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "btt",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Backtrace process given its struct task address",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .cmd_name = "env",
+ .cmd_func = kdb_env,
+ .cmd_usage = "",
+ .cmd_help = "Show environment variables",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "set",
+ .cmd_func = kdb_set,
+ .cmd_usage = "",
+ .cmd_help = "Set environment variables",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "help",
+ .cmd_func = kdb_help,
+ .cmd_usage = "",
+ .cmd_help = "Display Help Message",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "?",
+ .cmd_func = kdb_help,
+ .cmd_usage = "",
+ .cmd_help = "Display Help Message",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "cpu",
+ .cmd_func = kdb_cpu,
+ .cmd_usage = "<cpunum>",
+ .cmd_help = "Switch to new cpu",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .cmd_name = "kgdb",
+ .cmd_func = kdb_kgdb,
+ .cmd_usage = "",
+ .cmd_help = "Enter kgdb mode",
+ .cmd_flags = 0,
+ },
+ { .cmd_name = "ps",
+ .cmd_func = kdb_ps,
+ .cmd_usage = "[<flags>|A]",
+ .cmd_help = "Display active task list",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "pid",
+ .cmd_func = kdb_pid,
+ .cmd_usage = "<pidnum>",
+ .cmd_help = "Switch to another task",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "reboot",
+ .cmd_func = kdb_reboot,
+ .cmd_usage = "",
+ .cmd_help = "Reboot the machine immediately",
+ .cmd_flags = KDB_ENABLE_REBOOT,
+ },
#if defined(CONFIG_MODULES)
- kdb_register_flags("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0,
- KDB_ENABLE_INSPECT);
+ { .cmd_name = "lsmod",
+ .cmd_func = kdb_lsmod,
+ .cmd_usage = "",
+ .cmd_help = "List loaded kernel modules",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_flags("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "sr",
+ .cmd_func = kdb_sr,
+ .cmd_usage = "<key>",
+ .cmd_help = "Magic SysRq key",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "dmesg",
+ .cmd_func = kdb_dmesg,
+ .cmd_usage = "[lines]",
+ .cmd_help = "Display syslog buffer",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
- if (arch_kgdb_ops.enable_nmi) {
- kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- }
- kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0,
- KDB_ENABLE_SIGNAL);
- kdb_register_flags("summary", kdb_summary, "",
- "Summarize the system", 4,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "defcmd",
+ .cmd_func = kdb_defcmd,
+ .cmd_usage = "name \"usage\" \"help\"",
+ .cmd_help = "Define a set of commands, down to endefcmd",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "kill",
+ .cmd_func = kdb_kill,
+ .cmd_usage = "<-signal> <pid>",
+ .cmd_help = "Send a signal to a process",
+ .cmd_flags = KDB_ENABLE_SIGNAL,
+ },
+ { .cmd_name = "summary",
+ .cmd_func = kdb_summary,
+ .cmd_usage = "",
+ .cmd_help = "Summarize the system",
+ .cmd_minlen = 4,
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "per_cpu",
+ .cmd_func = kdb_per_cpu,
+ .cmd_usage = "<sym> [<bytes>] [<cpu>]",
+ .cmd_help = "Display per_cpu variables",
+ .cmd_minlen = 3,
+ .cmd_flags = KDB_ENABLE_MEM_READ,
+ },
+ { .cmd_name = "grephelp",
+ .cmd_func = kdb_grep_help,
+ .cmd_usage = "",
+ .cmd_help = "Display help on | grep",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+};
+
+static kdbtab_t nmicmd = {
+ .cmd_name = "disable_nmi",
+ .cmd_func = kdb_disable_nmi,
+ .cmd_usage = "",
+ .cmd_help = "Disable NMI entry to KDB",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ kdb_register_table(maintab, ARRAY_SIZE(maintab));
+ if (arch_kgdb_ops.enable_nmi)
+ kdb_register_table(&nmicmd, 1);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index a4281fb99299..ccbed9089808 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -174,8 +174,11 @@ typedef struct _kdbtab {
short cmd_minlen; /* Minimum legal # command
* chars required */
kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
+ struct list_head list_node; /* Command list */
+ bool is_dynamic; /* Command table allocation type */
} kdbtab_t;
+extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
/* KDB breakpoint management functions */
@@ -207,9 +210,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
unsigned long mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig(struct task_struct *p, int sig);
-extern void kdb_meminfo_proc_show(void);
extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
@@ -230,7 +231,7 @@ extern struct task_struct *kdb_curr_task(int);
#define kdb_task_has_cpu(p) (task_curr(p))
-#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
@@ -254,4 +255,14 @@ extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
#endif /* CONFIG_KGDB_KDB */
+
+#define kdb_func_printf(format, args...) \
+ kdb_printf("%s: " format, __func__, ## args)
+
+#define kdb_dbg_printf(mask, format, args...) \
+ do { \
+ if (KDB_DEBUG(mask)) \
+ kdb_func_printf(format, ## args); \
+ } while (0)
+
#endif /* !_KDBPRIVATE_H */
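kdb_func_printf() and kdb_dbg_printf() fold the "prefix with __func__, gate on a debug mask" boilerplate into two variadic macros, using the GNU named-args form in which ## args swallows the trailing comma when no arguments follow the format string. The same pattern over plain printf(), with a bitmask standing in for KDB_DEBUG():

/* "## args" drops the trailing comma when the argument list is empty
 * (GNU extension; C23 offers __VA_OPT__ for the same purpose). */
#include <stdio.h>

static unsigned int debug_mask = 0x1;

#define func_printf(format, args...) \
    printf("%s: " format, __func__, ## args)

#define dbg_printf(mask, format, args...)          \
    do {                                           \
        if (debug_mask & (mask))                   \
            func_printf(format, ## args);          \
    } while (0)

int main(void)
{
    func_printf("starting\n");             /* no extra args */
    dbg_printf(0x1, "value=%d\n", 42);     /* printed */
    dbg_printf(0x2, "hidden=%d\n", 7);     /* masked off */
    return 0;
}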
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6226502ce049..91bb666d7c03 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -39,20 +39,15 @@
*/
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname,
- symtab);
+ kdb_dbg_printf(AR, "symname=%s, symtab=%px\n", symname, symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
if (symtab->sym_start) {
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 1, "
- "symtab->sym_start=0x%lx\n",
- symtab->sym_start);
+ kdb_dbg_printf(AR, "returns 1, symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
return 1;
}
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 0\n");
+ kdb_dbg_printf(AR, "returns 0\n");
return 0;
}
EXPORT_SYMBOL(kdbgetsymval);
@@ -87,16 +82,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
#define knt1_size 128 /* must be >= kallsyms table size */
char *knt1 = NULL;
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab);
+ kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
if (!knt1) {
- kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
- addr);
+ kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr);
goto out;
}
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
@@ -147,11 +140,8 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret,
- symtab->sym_start, symtab->mod_name, symtab->sym_name,
- symtab->sym_name);
+ kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
+ ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
out:
debug_kfree(knt1);
@@ -328,7 +318,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -353,7 +343,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size)
int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -435,7 +425,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -484,7 +474,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -528,7 +518,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -602,8 +592,7 @@ unsigned long kdb_task_state_string(const char *s)
res = ~0UL;
break;
default:
- kdb_printf("%s: unknown flag '%c' ignored\n",
- __func__, *s);
+ kdb_func_printf("unknown flag '%c' ignored\n", *s);
break;
}
++s;
@@ -665,24 +654,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
return (mask & kdb_task_state_string(state)) != 0;
}
-/*
- * kdb_print_nameval - Print a name and its value, converting the
- * value to a symbol lookup if possible.
- * Inputs:
- * name field name to print
- * val value of field
- */
-void kdb_print_nameval(const char *name, unsigned long val)
-{
- kdb_symtab_t symtab;
- kdb_printf(" %-11.11s ", name);
- if (kdbnearsym(val, &symtab))
- kdb_symbol_print(val, &symtab,
- KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
- else
- kdb_printf("0x%lx\n", val);
-}
-
/* Last ditch allocator for debugging, so we can still debug even when
* the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
* for space usage, not for speed. One smallish memory pool, the free
@@ -884,18 +855,16 @@ void debug_kusage(void)
if (!debug_kusage_one_time)
goto out;
debug_kusage_one_time = 0;
- kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
- __func__, dah_first);
+ kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first);
if (dah_first) {
h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %px size %d\n", __func__, h_used,
- h_used->size);
+ kdb_func_printf("h_used %px size %d\n", h_used, h_used->size);
}
do {
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
+ kdb_func_printf("h_used %px size %d caller %px\n",
+ h_used, h_used->size, h_used->caller);
h_free = (struct debug_alloc_header *)
(debug_alloc_pool + h_free->next);
} while (h_free->next);
@@ -903,8 +872,8 @@ void debug_kusage(void)
((char *)h_free + dah_overhead + h_free->size);
if ((char *)h_used - debug_alloc_pool !=
sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %px size %d caller %px\n",
- __func__, h_used, h_used->size, h_used->caller);
+ kdb_func_printf("h_used %px size %d caller %px\n",
+ h_used, h_used->size, h_used->caller);
out:
spin_unlock(&dap_lock);
}
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index da95df381483..e0e64f8b0739 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -21,6 +21,7 @@
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
#define DMA_MAP_MAX_THREADS 1024
#define DMA_MAP_MAX_SECONDS 300
+#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
#define DMA_MAP_BIDIRECTIONAL 0
#define DMA_MAP_TO_DEVICE 1
@@ -36,7 +37,8 @@ struct map_benchmark {
__s32 node; /* which numa node this benchmark will run on */
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction */
- __u8 expansion[84]; /* For future use */
+ __u32 dma_trans_ns; /* time for DMA transmission in ns */
+ __u8 expansion[80]; /* For future use */
};
struct map_benchmark_data {
@@ -87,6 +89,9 @@ static int map_benchmark_thread(void *data)
map_etime = ktime_get();
map_delta = ktime_sub(map_etime, map_stime);
+ /* Pretend DMA is transmitting */
+ ndelay(map->bparam.dma_trans_ns);
+
unmap_stime = ktime_get();
dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
unmap_etime = ktime_get();
@@ -218,6 +223,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
return -EINVAL;
}
+ if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+ pr_err("invalid transmission delay\n");
+ return -EINVAL;
+ }
+
if (map->bparam.node != NUMA_NO_NODE &&
!node_possible(map->bparam.node)) {
pr_err("invalid numa node\n");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 84de6b1c5fab..b6a633679933 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -517,46 +517,6 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
}
EXPORT_SYMBOL_GPL(dma_free_pages);
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
-{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- void *vaddr;
-
- if (!ops || !ops->alloc_noncoherent) {
- struct page *page;
-
- page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (!page)
- return NULL;
- return page_address(page);
- }
-
- size = PAGE_ALIGN(size);
- vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
- if (vaddr)
- debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
- *dma_handle);
- return vaddr;
-}
-EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
-
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, enum dma_data_direction dir)
-{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (!ops || !ops->free_noncoherent) {
- dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
- return;
- }
-
- size = PAGE_ALIGN(size);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
- ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
-}
-EXPORT_SYMBOL_GPL(dma_free_noncoherent);
-
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..c10e855a03bc 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -50,9 +50,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
-#define OFFSET(val,align) ((unsigned long) \
- ( (val) & ( (align) - 1)))
-
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
/*
@@ -103,6 +100,11 @@ static unsigned int max_segment;
static phys_addr_t *io_tlb_orig_addr;
/*
+ * The mapped buffer's size should be validated during a sync operation.
+ */
+static size_t *io_tlb_orig_size;
+
+/*
* Protect the above data structures in the map and unmap calls
*/
static DEFINE_SPINLOCK(io_tlb_lock);
@@ -171,7 +173,7 @@ void __init swiotlb_adjust_size(unsigned long new_size)
* adjust/expand SWIOTLB size for their use.
*/
if (!io_tlb_nslabs) {
- size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+ size = ALIGN(new_size, IO_TLB_SIZE);
io_tlb_nslabs = size >> IO_TLB_SHIFT;
io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
@@ -192,6 +194,16 @@ void swiotlb_print_info(void)
bytes >> 20);
}
+static inline unsigned long io_tlb_offset(unsigned long val)
+{
+ return val & (IO_TLB_SEGSIZE - 1);
+}
+
+static inline unsigned long nr_slots(u64 val)
+{
+ return DIV_ROUND_UP(val, IO_TLB_SIZE);
+}
+
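These helpers pin down the unit conversions the rest of the file relies on: io_tlb_offset() yields a slot's position within its IO_TLB_SEGSIZE segment, and nr_slots() rounds a byte count up to whole IO_TLB_SIZE slots. A quick userspace check of the arithmetic, assuming the usual IO_TLB_SHIFT = 11 (2 KiB slots) and IO_TLB_SEGSIZE = 128 from the swiotlb headers:

/* Sanity-check the swiotlb unit conversions in userspace. */
#include <stdio.h>

#define IO_TLB_SHIFT   11                      /* 2 KiB per slot */
#define IO_TLB_SIZE    (1UL << IO_TLB_SHIFT)
#define IO_TLB_SEGSIZE 128                     /* slots per segment */

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long io_tlb_offset(unsigned long val)
{
    return val & (IO_TLB_SEGSIZE - 1);
}

static unsigned long nr_slots(unsigned long long val)
{
    return DIV_ROUND_UP(val, IO_TLB_SIZE);
}

int main(void)
{
    printf("slot 130 is entry %lu of its segment\n", io_tlb_offset(130)); /* 2 */
    printf("%lu slot(s) for 1 byte\n", nr_slots(1));        /* 1 */
    printf("%lu slot(s) for 4096 bytes\n", nr_slots(4096)); /* 2 */
    return 0;
}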
/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
@@ -240,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
+ io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_orig_size)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -363,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
* between io_tlb_start and io_tlb_end.
*/
io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
+ get_order(io_tlb_nslabs * sizeof(int)));
if (!io_tlb_list)
goto cleanup3;
@@ -374,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!io_tlb_orig_addr)
goto cleanup4;
+ io_tlb_orig_size = (size_t *)
+ __get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs *
+ sizeof(size_t)));
+ if (!io_tlb_orig_size)
+ goto cleanup5;
+
for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -389,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
return 0;
+cleanup5:
+ free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+ sizeof(phys_addr_t)));
+
cleanup4:
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
sizeof(int)));
@@ -404,6 +436,8 @@ void __init swiotlb_exit(void)
return;
if (late_alloc) {
+ free_pages((unsigned long)io_tlb_orig_size,
+ get_order(io_tlb_nslabs * sizeof(size_t)));
free_pages((unsigned long)io_tlb_orig_addr,
get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
@@ -413,6 +447,8 @@ void __init swiotlb_exit(void)
} else {
memblock_free_late(__pa(io_tlb_orig_addr),
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_orig_size),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
memblock_free_late(__pa(io_tlb_list),
PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
memblock_free_late(io_tlb_start,
@@ -461,79 +497,71 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
}
}
-phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
- size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start);
- unsigned long flags;
- phys_addr_t tlb_addr;
- unsigned int nslots, stride, index, wrap;
- int i;
- unsigned long mask;
- unsigned long offset_slots;
- unsigned long max_slots;
- unsigned long tmp_io_tlb_used;
-
- if (no_iotlb_memory)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
-
- if (mem_encrypt_active())
- pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
- if (mapping_size > alloc_size) {
- dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
- mapping_size, alloc_size);
- return (phys_addr_t)DMA_MAPPING_ERROR;
- }
+/*
+ * Return the offset into an iotlb slot required to keep the device happy.
+ */
+static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+{
+ return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
+}
- mask = dma_get_seg_boundary(hwdev);
+/*
+ * Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
+ */
+static inline unsigned long get_max_slots(unsigned long boundary_mask)
+{
+ if (boundary_mask == ~0UL)
+ return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ return nr_slots(boundary_mask + 1);
+}
- tbl_dma_addr &= mask;
+static unsigned int wrap_index(unsigned int index)
+{
+ if (index >= io_tlb_nslabs)
+ return 0;
+ return index;
+}
- offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+/*
+ * Find a suitable number of IO TLB entries that will fit this request and
+ * allocate a buffer from that IO TLB pool.
+ */
+static int find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
+{
+ unsigned long boundary_mask = dma_get_seg_boundary(dev);
+ dma_addr_t tbl_dma_addr =
+ phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask;
+ unsigned long max_slots = get_max_slots(boundary_mask);
+ unsigned int iotlb_align_mask =
+ dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ unsigned int nslots = nr_slots(alloc_size), stride;
+ unsigned int index, wrap, count = 0, i;
+ unsigned long flags;
- /*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
- max_slots = mask + 1
- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+ BUG_ON(!nslots);
/*
- * For mappings greater than or equal to a page, we limit the stride
- * (and hence alignment) to a page size.
+ * For mappings with an alignment requirement don't bother looping to
+ * unaligned slots once we found an aligned one. For allocations of
+ * PAGE_SIZE or larger only look for page aligned allocations.
*/
- nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
if (alloc_size >= PAGE_SIZE)
- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
- else
- stride = 1;
-
- BUG_ON(!nslots);
+ stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- /*
- * Find suitable number of IO TLB entries size that will fit this
- * request and allocate a buffer from that IO TLB pool.
- */
spin_lock_irqsave(&io_tlb_lock, flags);
-
if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
goto not_found;
- index = ALIGN(io_tlb_index, stride);
- if (index >= io_tlb_nslabs)
- index = 0;
- wrap = index;
-
+ index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
do {
- while (iommu_is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
+ if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask)) {
+ index = wrap_index(index + 1);
+ continue;
}
/*
@@ -541,55 +569,96 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
* contiguous buffers, we allocate the buffers from that slot
* and mark the entries as '0' indicating unavailable.
*/
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int) (index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
- tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in the next
- * round.
- */
- io_tlb_index = ((index + nslots) < io_tlb_nslabs
- ? (index + nslots) : 0);
-
- goto found;
+ if (!iommu_is_span_boundary(index, nslots,
+ nr_slots(tbl_dma_addr),
+ max_slots)) {
+ if (io_tlb_list[index] >= nslots)
+ goto found;
}
- index += stride;
- if (index >= io_tlb_nslabs)
- index = 0;
+ index = wrap_index(index + stride);
} while (index != wrap);
not_found:
- tmp_io_tlb_used = io_tlb_used;
-
spin_unlock_irqrestore(&io_tlb_lock, flags);
- if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
- return (phys_addr_t)DMA_MAPPING_ERROR;
+ return -1;
+
found:
+ for (i = index; i < index + nslots; i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
+ io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+
+ /*
+ * Update the indices to avoid searching in the next round.
+ */
+ if (index + nslots < io_tlb_nslabs)
+ io_tlb_index = index + nslots;
+ else
+ io_tlb_index = 0;
io_tlb_used += nslots;
+
spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return index;
+}
+
+phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, i;
+ phys_addr_t tlb_addr;
+
+ if (no_iotlb_memory)
+ panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+
+ if (mem_encrypt_active())
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ if (mapping_size > alloc_size) {
+ dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ index = find_slots(dev, orig_addr, alloc_size + offset);
+ if (index == -1) {
+ if (!(attrs & DMA_ATTR_NO_WARN))
+ dev_warn_ratelimited(dev,
+ "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ alloc_size, io_tlb_nslabs, io_tlb_used);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
/*
* Save away the mapping from the original address to the DMA address.
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+ io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
+ io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+ }
+ tlb_addr = slot_addr(io_tlb_start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
-
return tlb_addr;
}
+static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
+{
+ if (*size > orig_size) {
+ /* Warn and truncate mapping_size */
+ dev_WARN_ONCE(hwdev, 1,
+ "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
+ orig_size, *size);
+ *size = orig_size;
+ }
+}
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
@@ -598,10 +667,13 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+ int i, count, nslots = nr_slots(alloc_size + offset);
+ int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
+ validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+
/*
* First, sync the memory before unmapping the entry
*/
@@ -617,26 +689,30 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* with slots below and above the pool being returned.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
- {
- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
- io_tlb_list[index + nslots] : 0);
- /*
- * Step 1: return the slots to the free list, merging the
- * slots with superceeding slots
- */
- for (i = index + nslots - 1; i >= index; i--) {
- io_tlb_list[i] = ++count;
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- }
- /*
- * Step 2: merge the returned slots with the preceding slots,
- * if available (non zero)
- */
- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
+ if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
+ count = io_tlb_list[index + nslots];
+ else
+ count = 0;
- io_tlb_used -= nslots;
+ /*
+ * Step 1: return the slots to the free list, merging the slots with
+ * superseding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ io_tlb_list[i] = ++count;
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
+
+ /*
+ * Step 2: merge the returned slots with the preceding slots, if
+ * available (non zero)
+ */
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+ i--)
+ io_tlb_list[i] = ++count;
+ io_tlb_used -= nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -645,11 +721,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
enum dma_sync_target target)
{
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ size_t orig_size = io_tlb_orig_size[index];
phys_addr_t orig_addr = io_tlb_orig_addr[index];
if (orig_addr == INVALID_PHYS_ADDR)
return;
- orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+
+ validate_sync_size_and_truncate(hwdev, orig_size, &size);
switch (target) {
case SYNC_FOR_CPU:
@@ -707,7 +785,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
}
bool is_swiotlb_active(void)
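
For readers following the slot arithmetic in the swiotlb hunks above: a minimal sketch of the nr_slots()/slot_addr() helpers those hunks rely on, assuming the 2 KB slot size implied by IO_TLB_SHIFT; treat the constants as illustrative rather than authoritative.

#include <linux/kernel.h>	/* DIV_ROUND_UP() */
#include <linux/types.h>

#define IO_TLB_SHIFT	11			/* assumed: 2 KB slots */
#define IO_TLB_SIZE	(1 << IO_TLB_SHIFT)

/* Number of slots needed to cover @val bytes, rounding up. */
static inline unsigned long nr_slots(u64 val)
{
	return DIV_ROUND_UP(val, IO_TLB_SIZE);
}

/* Physical address of slot @idx counted from @start. */
static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
{
	return start + (idx << IO_TLB_SHIFT);
}

With this, "tlb_addr = slot_addr(io_tlb_start, index) + offset" reads as: the bounce address is the start of the first allocated slot plus the sub-slot alignment offset the device requires.
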
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 8442e5c9cfa2..a0b3b04fb596 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -341,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
- * assume that it is the first interupt and eventually claim
+ * assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
@@ -422,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMT_DYNAMIC
+#ifdef CONFIG_PREEMPT_DYNAMIC
static_call(irqentry_exit_cond_resched)();
#else
irqentry_exit_cond_resched();
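
The guard fixed above selects the static_call() fast path. As a hedged sketch of that mechanism (all names here are illustrative, not the kernel's): a static call is patched as a direct branch and can be retargeted at boot, which is how CONFIG_PREEMPT_DYNAMIC picks a preemption model.

#include <linux/static_call.h>

static void cond_resched_target(void)	/* illustrative default target */
{
	/* reschedule-if-needed work would live here */
}

static void nop_target(void)		/* illustrative "preempt=none" target */
{
}

DEFINE_STATIC_CALL(exit_cond_resched, cond_resched_target);

static void exit_path(void)
{
	/* Compiled as a direct call; no function-pointer indirection. */
	static_call(exit_cond_resched)();
}

static void pick_none_model(void)
{
	/* e.g. from a "preempt=" boot-parameter handler */
	static_call_update(exit_cond_resched, nop_target);
}
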
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 129dee540a8b..03db40f6cba9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -269,7 +269,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
if (!event->parent) {
/*
* If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
* perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -1303,7 +1304,7 @@ static void put_ctx(struct perf_event_context *ctx)
* life-time rules separate them. That is an exiting task cannot fork, and a
* spawning task cannot (yet) exit.
*
- * But remember that that these are parent<->child context relations, and
+ * But remember that these are parent<->child context relations, and
* migration does not affect children, therefore these two orderings should not
* interact.
*
@@ -1442,7 +1443,7 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
*
- * This has to cope with with the fact that until it is locked,
+ * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
@@ -2486,7 +2487,7 @@ static void perf_set_shadow_time(struct perf_event *event,
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
- * within the time time source all along. We believe it
+ * within the time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
@@ -3461,11 +3462,16 @@ unlock:
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
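
A condensed sketch of the registration scheme the perf hunks above add, using the same names as the diff: a per-CPU counter keeps the context-switch hook cheap when unused, and a per-CPU list limits the walk to PMUs that asked for callbacks. Locking and the real struct layout are elided; this mirrors, not replaces, the code above.

#include <linux/list.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct list_head, sched_cb_list);

struct cpu_ctx {			/* stand-in for perf_cpu_context */
	int sched_cb_usage;
	struct list_head sched_cb_entry;
};

static void sched_cb_inc(struct cpu_ctx *cpuctx)
{
	if (!cpuctx->sched_cb_usage++)	/* 0 -> 1: join this CPU's list */
		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
	this_cpu_inc(perf_sched_cb_usages);
}

static void sched_cb_dec(struct cpu_ctx *cpuctx)
{
	this_cpu_dec(perf_sched_cb_usages);
	if (!--cpuctx->sched_cb_usage)	/* 1 -> 0: leave the list */
		list_del(&cpuctx->sched_cb_entry);
}

Each list head must be initialized at boot, which is what the INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)) hunk in perf_event_init_all_cpus() does.
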
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3ea7f8f92f1d..6addc9780319 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1733,7 +1733,7 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task if if necessary.
+ * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..426cd0c51f9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_pasid(struct mm_struct *mm)
+{
+#ifdef CONFIG_IOMMU_SUPPORT
+ mm->pasid = INIT_PASID;
+#endif
+}
+
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1940,6 +1948,14 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ if (args->io_thread) {
+ /*
+ * Mark us as an IO worker, and block any signal that isn't
+ * fatal or STOP
+ */
+ p->flags |= PF_IO_WORKER;
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ }
/*
* This _must_ happen before we call free_task(), i.e. before we jump
@@ -2411,6 +2427,28 @@ struct mm_struct *copy_init_mm(void)
}
/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ CLONE_IO;
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ .io_thread = 1,
+ };
+
+ return copy_process(NULL, 0, node, &args);
+}
+
+/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
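
A hedged usage sketch for create_io_thread() as documented in the comment above; the worker body and wrapper are illustrative, but the contract (error-pointer return, caller-driven wake-up, signals blocked) follows the comment.

#include <linux/err.h>
#include <linux/sched/task.h>

static int io_worker_fn(void *data)	/* illustrative worker body */
{
	/* process io_uring work until asked to exit */
	return 0;
}

static int start_io_worker(void *wq, int node)
{
	struct task_struct *tsk;

	tsk = create_io_thread(io_worker_fn, wq, node);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* The task comes back inactive; the caller must fire it up. */
	wake_up_new_task(tsk);
	return 0;
}
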
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
goto out;
restart = &current->restart_block;
- restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
- ret = -ERESTART_RESTARTBLOCK;
+ ret = set_restart_fn(restart, futex_wait_restart);
out:
if (to) {
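
The futex hunk folds the fn assignment and the -ERESTART_RESTARTBLOCK return into set_restart_fn(). A hedged sketch of what such a helper plausibly looks like; the arch_set_restart_data() hook is an assumption modeled on this series, not quoted from it.

#include <linux/errno.h>
#include <linux/restart_block.h>

#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)	/* assumed default */
#endif

static inline long set_restart_fn_sketch(struct restart_block *restart,
					 long (*fn)(struct restart_block *))
{
	restart->fn = fn;
	arch_set_restart_data(restart);	/* arch hook; default is a no-op */
	return -ERESTART_RESTARTBLOCK;
}
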
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c94b820a1b62..c466c7fbdece 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -70,12 +70,16 @@ struct gcov_fn_info {
u32 ident;
u32 checksum;
+#if CONFIG_CLANG_VERSION < 110000
u8 use_extra_checksum;
+#endif
u32 cfg_checksum;
u32 num_counters;
u64 *counters;
+#if CONFIG_CLANG_VERSION < 110000
const char *function_name;
+#endif
};
static struct gcov_info *current_info;
@@ -105,6 +109,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
}
EXPORT_SYMBOL(llvm_gcov_init);
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_start_file(const char *orig_filename, const char version[4],
u32 checksum)
{
@@ -113,7 +118,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],
current_info->checksum = checksum;
}
EXPORT_SYMBOL(llvm_gcda_start_file);
+#else
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+ current_info->filename = orig_filename;
+ current_info->version = version;
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+#endif
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_emit_function(u32 ident, const char *function_name,
u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
{
@@ -132,6 +147,21 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
list_add_tail(&info->head, &current_info->functions);
}
+#else
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->cfg_checksum = cfg_checksum;
+ list_add_tail(&info->head, &current_info->functions);
+}
+#endif
EXPORT_SYMBOL(llvm_gcda_emit_function);
void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
@@ -262,11 +292,16 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
!list_is_last(&fn_ptr2->head, &info2->functions)) {
if (fn_ptr1->checksum != fn_ptr2->checksum)
return false;
+#if CONFIG_CLANG_VERSION < 110000
if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
return false;
if (fn_ptr1->use_extra_checksum &&
fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
return false;
+#else
+ if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+#endif
fn_ptr1 = list_next_entry(fn_ptr1, head);
fn_ptr2 = list_next_entry(fn_ptr2, head);
}
@@ -295,6 +330,7 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
}
}
+#if CONFIG_CLANG_VERSION < 110000
static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
{
size_t cv_size; /* counter values size */
@@ -322,6 +358,28 @@ err_name:
kfree(fn_dup);
return NULL;
}
+#else
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters) {
+ kfree(fn_dup);
+ return NULL;
+ }
+
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+}
+#endif
/**
* gcov_info_dup - duplicate profiling data set
@@ -362,6 +420,7 @@ err:
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
+#if CONFIG_CLANG_VERSION < 110000
void gcov_info_free(struct gcov_info *info)
{
struct gcov_fn_info *fn, *tmp;
@@ -375,6 +434,20 @@ void gcov_info_free(struct gcov_info *info)
kfree(info->filename);
kfree(info);
}
+#else
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+#endif
#define ITER_STRIDE PAGE_SIZE
@@ -460,17 +533,22 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
list_for_each_entry(fi_ptr, &info->functions, head) {
u32 i;
- u32 len = 2;
-
- if (fi_ptr->use_extra_checksum)
- len++;
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, len);
+#if CONFIG_CLANG_VERSION < 110000
+ pos += store_gcov_u32(buffer, pos,
+ fi_ptr->use_extra_checksum ? 3 : 2);
+#else
+ pos += store_gcov_u32(buffer, pos, 3);
+#endif
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+#if CONFIG_CLANG_VERSION < 110000
if (fi_ptr->use_extra_checksum)
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#else
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#endif
pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
diff --git a/kernel/groups.c b/kernel/groups.c
index fe7e6385530e..787b381c7c00 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -15,12 +15,7 @@
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *gi;
- unsigned int len;
-
- len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
- gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
- if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT);
+ gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
if (!gi)
return NULL;
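
The groups_alloc() rewrite leans on two helpers worth spelling out: kvmalloc() tries kmalloc and transparently falls back to vmalloc, and struct_size() computes header-plus-flexible-array sizes with overflow checking. A minimal sketch of the same pattern on a hypothetical struct:

#include <linux/mm.h>		/* kvmalloc(), kvfree() */
#include <linux/overflow.h>	/* struct_size() */
#include <linux/uidgid.h>

struct demo_groups {		/* hypothetical, mirrors group_info */
	int ngids;
	kgid_t gid[];		/* flexible array member */
};

static struct demo_groups *demo_groups_alloc(int gidsetsize)
{
	struct demo_groups *gi;

	/* sizeof(*gi) + gidsetsize * sizeof(gi->gid[0]), overflow-checked */
	gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
	if (!gi)
		return NULL;
	gi->ngids = gidsetsize;
	return gi;
}
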
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6d89e33fe3aa..8cc8e5713287 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
- * Interrupt occures on the falling and/or rising edge of a hardware
+ * Interrupt occurs on the falling and/or rising edge of a hardware
* signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
@@ -808,7 +808,7 @@ void handle_edge_irq(struct irq_desc *desc)
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
- * Renable it, if it was not disabled in meantime.
+ * Reenable it, if it was not disabled in meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
@@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
* @dest: The affinity mask to set
* @force: Flag to enforce setting (disable online checks)
*
- * Conditinal, as the underlying parent chip might not implement it.
+ * Conditional, as the underlying parent chip might not implement it.
*/
int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest, bool force)
@@ -1531,7 +1531,7 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
- * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * irq_chip_compose_msi_msg - Compose msi message for a irq chip
* @data: Pointer to interrupt specific data
* @msg: Pointer to the MSI message
*
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 0b0cdf206dc4..7fe6cffe7d0d 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -13,7 +13,7 @@
/*
* What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
+ * Each architecture has to answer this itself.
*/
static void ack_bad(struct irq_data *data)
{
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 43e3d1be622c..52f11c791bf8 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -107,7 +107,7 @@ free_descs:
* @irq: linux irq number to be destroyed
* @dest: cpumask of cpus which should have the IPI removed
*
- * The IPIs allocated with irq_reserve_ipi() are retuerned to the system
+ * The IPIs allocated with irq_reserve_ipi() are returned to the system
* destroying all virqs associated with them.
*
* Return 0 on success or error code on failure.
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 48006608baf0..0cd02efa3a74 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -24,10 +24,6 @@ struct irq_sim_irq_ctx {
struct irq_sim_work_ctx *work_ctx;
};
-struct irq_sim_devres {
- struct irq_domain *domain;
-};
-
static void irq_sim_irqmask(struct irq_data *data)
{
struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
@@ -159,7 +155,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
* irq_domain_create_sim - Create a new interrupt simulator irq_domain and
* allocate a range of dummy interrupts.
*
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate.
*
* On success: return a new irq_domain object.
@@ -216,11 +212,11 @@ void irq_domain_remove_sim(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
-static void devm_irq_domain_release_sim(struct device *dev, void *res)
+static void devm_irq_domain_remove_sim(void *data)
{
- struct irq_sim_devres *this = res;
+ struct irq_domain *domain = data;
- irq_domain_remove_sim(this->domain);
+ irq_domain_remove_sim(domain);
}
/**
@@ -228,7 +224,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
* a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
* On success: return a new irq_domain object.
@@ -238,20 +234,17 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
- struct irq_sim_devres *dr;
+ struct irq_domain *domain;
+ int ret;
- dr = devres_alloc(devm_irq_domain_release_sim,
- sizeof(*dr), GFP_KERNEL);
- if (!dr)
- return ERR_PTR(-ENOMEM);
+ domain = irq_domain_create_sim(fwnode, num_irqs);
+ if (IS_ERR(domain))
+ return domain;
- dr->domain = irq_domain_create_sim(fwnode, num_irqs);
- if (IS_ERR(dr->domain)) {
- devres_free(dr);
- return dr->domain;
- }
+ ret = devm_add_action_or_reset(dev, devm_irq_domain_remove_sim, domain);
+ if (ret)
+ return ERR_PTR(ret);
- devres_add(dev, dr);
- return dr->domain;
+ return domain;
}
EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
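
The irq_sim conversion above is the standard devres simplification: instead of a custom devres container, devm_add_action_or_reset() registers a teardown callback that runs at device unbind, or immediately if registration itself fails. A hedged sketch of the shape with a hypothetical resource:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>

struct demo_res { int dummy; };		/* hypothetical resource */

static void demo_teardown(void *data)
{
	kfree(data);			/* undo demo_setup() */
}

static struct demo_res *demo_setup(struct device *dev)
{
	struct demo_res *res;
	int ret;

	res = kzalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return ERR_PTR(-ENOMEM);

	/* On failure the action runs at once, so @res is already freed. */
	ret = devm_add_action_or_reset(dev, demo_teardown, res);
	if (ret)
		return ERR_PTR(ret);

	return res;
}
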
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cc1a09406c6e..4a617d7312a4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -31,7 +31,7 @@ static int __init irq_affinity_setup(char *str)
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
- * bugreports caused by random comandline masks
+ * bugreports caused by random commandline masks
*/
cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
return 1;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6aacd342cd14..f42ef868efd3 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* @name: Optional user provided domain name
* @pa: Optional user-provided physical address
*
- * Allocate a struct irqchip_fwid, and return a poiner to the embedded
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
* fwnode_handle (or NULL on failure).
*
* Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
@@ -205,6 +205,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
fwnode_handle_get(fwnode);
+ fwnode_dev_initialized(fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
@@ -253,6 +254,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
+ fwnode_dev_initialized(domain->fwnode, false);
fwnode_handle_put(domain->fwnode);
if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
kfree(domain->name);
@@ -663,7 +665,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL) {
@@ -701,41 +703,6 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
-/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
- *
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
- */
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
- irq_hw_number_t hwirq_base, int count)
-{
- struct device_node *of_node;
- int ret;
-
- of_node = irq_domain_get_of_node(domain);
- ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(of_node));
- if (unlikely(ret < 0))
- return ret;
-
- irq_domain_associate_many(domain, irq_base, hwirq_base, count);
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-
static int irq_domain_translate(struct irq_domain *d,
struct irq_fwspec *fwspec,
irq_hw_number_t *hwirq, unsigned int *type)
@@ -904,7 +871,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
{
struct irq_data *data;
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
@@ -1434,7 +1401,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
* The whole process to setup an IRQ has been split into two steps.
* The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
* descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
* resources. In this way, it's easier to rollback when failing to
* allocate resources.
*/
@@ -1692,12 +1659,10 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @domain: Domain below which interrupts must be allocated
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to allocate
* @arg: Allocation data (arch/domain specific)
- *
- * Check whether the domain has been setup recursive. If not allocate
- * through the parent domain.
*/
int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs,
@@ -1713,11 +1678,9 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @domain: Domain below which interrupts must be freed
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to free
- *
- * Check whether the domain has been setup recursive. If not free
- * through the parent domain.
*/
void irq_domain_free_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs)
@@ -1896,16 +1859,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
- if (!d->name || !domain_dir || d->debugfs_file)
+ if (!d->name || !domain_dir)
return;
- d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &irq_domain_debug_fops);
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(d->debugfs_file);
- d->debugfs_file = NULL;
+ debugfs_remove(debugfs_lookup(d->name, domain_dir));
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dec3f73e8db9..4c14356543d9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -179,7 +179,7 @@ bool irq_can_set_affinity_usr(unsigned int irq)
/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affitnity changed
+ * @desc: irq descriptor which has affinity changed
*
* We just set IRQTF_AFFINITY and delegate the affinity setting
* to the interrupt thread itself. We can not call
@@ -326,7 +326,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
* If the interrupt is not yet activated, just store the affinity
* mask and do not call the chip driver at all. On activation the
* driver has to make sure anyway that the interrupt is in a
- * useable state so startup works.
+ * usable state so startup works.
*/
if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
irqd_is_activated(data) || !irqd_affinity_on_activate(data))
@@ -1054,7 +1054,7 @@ again:
* to IRQS_INPROGRESS and the irq line is masked forever.
*
* This also serializes the state of shared oneshot handlers
- * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * versus "desc->threads_oneshot |= action->thread_mask;" in
* irq_wake_thread(). See the comment there which explains the
* serialization.
*/
@@ -1142,18 +1142,22 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
/*
* Interrupts explicitly requested as threaded interrupts want to be
- * preemtible - many of them need to sleep and wait for slow busses to
+ * preemptible - many of them need to sleep and wait for slow busses to
* complete.
*/
static irqreturn_t irq_thread_fn(struct irq_desc *desc,
@@ -1693,7 +1697,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
- if (irq_settings_can_autoenable(desc)) {
+ if (!(new->flags & IRQF_NO_AUTOEN) &&
+ irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
} else {
/*
@@ -1908,7 +1913,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
/* Last action releases resources */
if (!desc->action) {
/*
- * Reaquire bus lock as irq_release_resources() might
+ * Reacquire bus lock as irq_release_resources() might
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
@@ -2086,10 +2091,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*
+ * Also shared interrupts do not go well with disabling auto enable.
+ * A second sharer might request the line while it is still disabled
+ * and then wait for interrupts forever.
+ *
* Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
* it cannot be set along with IRQF_NO_SUSPEND.
*/
if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) ||
(!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
@@ -2245,7 +2255,8 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
desc = irq_to_desc(irq);
- if (!desc || irq_settings_can_autoenable(desc) ||
+ if (!desc || (irq_settings_can_autoenable(desc) &&
+ !(irqflags & IRQF_NO_AUTOEN)) ||
!irq_settings_can_request(desc) ||
WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
!irq_supports_nmi(desc))
@@ -2742,7 +2753,7 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be storeed
+ * @state: a pointer to a boolean where the state is to be stored
*
* This call snapshots the internal irqchip state of an
* interrupt, returning into @state the bit corresponding to
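
IRQF_NO_AUTOEN, which the hunks above validate against IRQF_SHARED and NMI setup, requests a line without enabling it, replacing the racy request_irq()+disable_irq() sequence. A hedged usage sketch; the handler and names are illustrative:

#include <linux/interrupt.h>

static irqreturn_t demo_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int demo_request(int irq, void *dev_data)
{
	int ret;

	/* Line stays disabled until the device is ready; per the check
	 * above, IRQF_NO_AUTOEN cannot be combined with IRQF_SHARED. */
	ret = request_irq(irq, demo_handler, IRQF_NO_AUTOEN, "demo",
			  dev_data);
	if (ret)
		return ret;

	/* ... finish hardware init ... */
	enable_irq(irq);
	return 0;
}
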
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 651a4ad6d711..578596e41cb6 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -337,15 +337,14 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
* irq_matrix_reserve - Reserve interrupts
* @m: Matrix pointer
*
- * This is merily a book keeping call. It increments the number of globally
+ * This is merely a bookkeeping call. It increments the number of globally
* reserved interrupt bits w/o actually allocating them. This allows to
* setup interrupt descriptors w/o assigning low level resources to it.
* The actual allocation happens when the interrupt gets activated.
*/
void irq_matrix_reserve(struct irq_matrix *m)
{
- if (m->global_reserved <= m->global_available &&
- m->global_reserved + 1 > m->global_available)
+ if (m->global_reserved == m->global_available)
pr_warn("Interrupt reservation exceeds available resources\n");
m->global_reserved++;
@@ -356,7 +355,7 @@ void irq_matrix_reserve(struct irq_matrix *m)
* irq_matrix_remove_reserved - Remove interrupt reservation
* @m: Matrix pointer
*
- * This is merily a book keeping call. It decrements the number of globally
+ * This is merely a book keeping call. It decrements the number of globally
* reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
* interrupt was never in use and a real vector allocated, which undid the
* reservation.
@@ -423,7 +422,9 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
return;
- clear_bit(bit, cm->alloc_map);
+ if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map)))
+ return;
+
cm->allocated--;
if(managed)
cm->managed_allocated--;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index def48589ea48..61ca924ef4b4 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,7 +7,7 @@
/**
* irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
- * @desc: Interrupt descpriptor to clean up
+ * @desc: Interrupt descriptor to clean up
* @force_clear: If set clear the move pending bit unconditionally.
* If not set, clear it only when the dying CPU is the
* last one in the pending mask.
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b338d622f26e..c41965e348b5 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -5,7 +5,7 @@
*
* This file is licensed under GPLv2.
*
- * This file contains common code to support Message Signalled Interrupt for
+ * This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
#include <linux/types.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 98138788cb04..7c5cd42df3b9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -144,7 +144,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
@@ -238,7 +238,7 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index bd1d85c610aa..0c46e9fe3a89 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -128,7 +128,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
if (!try_retrigger(desc))
err = irq_sw_resend(desc);
- /* If the retrigger was successfull, mark it with the REPLAY bit */
+ /* If the retrigger was successful, mark it with the REPLAY bit */
if (!err)
desc->istate |= IRQS_REPLAY;
return err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f865e5f4d382..c481d8458325 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -403,6 +403,10 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
desc->irqs_unhandled -= ok;
}
+ if (likely(!desc->irqs_unhandled))
+ return;
+
+ /* Now getting into unhandled irq detection */
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 773b6105c4ae..d309d6fbf5bd 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -84,7 +84,7 @@ void irq_timings_disable(void)
* 2. Log interval
*
* We saw the irq timings allow to compute the interval of the
- * occurrences for a specific interrupt. We can reasonibly assume the
+ * occurrences for a specific interrupt. We can reasonably assume the
* longer is the interval, the higher is the error for the next event
* and we can consider storing those interval values into an array
* where each slot in the array correspond to an interval at the power
@@ -416,7 +416,7 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
* Copy the content of the circular buffer into another buffer
* in order to linearize the buffer instead of dealing with
* wrapping indexes and shifted array which will be prone to
- * error and extremelly difficult to debug.
+ * error and extremely difficult to debug.
*/
for (i = 0; i < count; i++) {
int index = (start + i) & IRQ_TIMINGS_MASK;
@@ -485,7 +485,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
/*
* The interrupt triggered more than one second apart, that
- * ends the sequence as predictible for our purpose. In this
+ * ends the sequence as predictable for our purpose. In this
* case, assume we have the beginning of a sequence and the
* timestamp is the first value. As it is impossible to
* predict anything at this point, return.
@@ -514,7 +514,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
* If more than the array size interrupts happened during the
* last busy/idle cycle, the index wrapped up and we have to
* begin with the next element in the array which is the last one
- * in the sequence, otherwise it is a the index 0.
+ * in the sequence, otherwise it is at index 0.
*
* - have an indication of the interrupts activity on this CPU
* (eg. irq/sec)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index c6a39d662935..ba39fbb1f8e7 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
WARN_ONCE(!jump_entry_is_init(entry),
"can't patch jump_label at %pS",
(void *)jump_entry_code(entry));
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fe9de067771c..c851ca0ed357 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,27 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
+#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
+/*
+ * LLVM appends a hash to static function names when ThinLTO and CFI are
+ * both enabled, e.g. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
+ * This causes confusion and potentially breaks user space tools, so we
+ * strip the suffix from expanded symbol names.
+ */
+static inline bool cleanup_symbol_name(char *s)
+{
+ char *res;
+
+ res = strrchr(s, '$');
+ if (res)
+ *res = '\0';
+
+ return res != NULL;
+}
+#else
+static inline bool cleanup_symbol_name(char *s) { return false; }
+#endif
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
@@ -173,10 +194,18 @@ unsigned long kallsyms_lookup_name(const char *name)
if (strcmp(namebuf, name) == 0)
return kallsyms_sym_address(i);
+
+ if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
+ return kallsyms_sym_address(i);
}
return module_kallsyms_lookup_name(name);
}
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Iterate over all symbols in vmlinux. For symbols from modules use
+ * module_kallsyms_on_each_symbol instead.
+ */
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
@@ -192,8 +221,9 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
if (ret != 0)
return ret;
}
- return module_kallsyms_on_each_symbol(fn, data);
+ return 0;
}
+#endif /* CONFIG_LIVEPATCH */
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
@@ -297,7 +327,9 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
- return namebuf;
+
+ ret = namebuf;
+ goto found;
}
/* See if it's in a module or a BPF JITed image. */
@@ -310,11 +342,16 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = ftrace_mod_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+found:
+ cleanup_symbol_name(namebuf);
return ret;
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
+ int res;
+
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -325,15 +362,23 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_name(addr, symname);
+ res = lookup_module_symbol_name(addr, symname);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(symname);
+ return 0;
}
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name)
{
+ int res;
+
name[0] = '\0';
name[KSYM_NAME_LEN - 1] = '\0';
@@ -345,10 +390,16 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
kallsyms_expand_symbol(get_symbol_offset(pos),
name, KSYM_NAME_LEN);
modname[0] = '\0';
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ res = lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(name);
+ return 0;
}
/* Look up a kernel symbol and return it in a text buffer. */
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 39d30ccf8d87..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-int machine_kexec_post_load(struct kimage *image);
-
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a3eb74a657..745f08fdd7a6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -861,7 +861,6 @@ out:
cpus_read_unlock();
}
-#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -887,6 +886,7 @@ out:
mutex_unlock(&kprobe_mutex);
}
+#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -1520,13 +1520,16 @@ valid:
return ap;
}
-/* Return error if the kprobe is being re-registered */
-static inline int check_kprobe_rereg(struct kprobe *p)
+/*
+ * Warn and return an error if the kprobe is being re-registered, since
+ * that indicates a software bug.
+ */
+static inline int warn_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
mutex_lock(&kprobe_mutex);
- if (__get_valid_kprobe(p))
+ if (WARN_ON_ONCE(__get_valid_kprobe(p)))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
@@ -1614,7 +1617,7 @@ int register_kprobe(struct kprobe *p)
return PTR_ERR(addr);
p->addr = addr;
- ret = check_kprobe_rereg(p);
+ ret = warn_kprobe_rereg(p);
if (ret)
return ret;
@@ -1995,7 +1998,7 @@ int register_kretprobe(struct kretprobe *rp)
return ret;
/* If only rp->kp.addr is specified, check reregistering kprobes */
- if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
+ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
if (kretprobe_blacklist_size) {
@@ -2497,18 +2500,14 @@ static int __init init_kprobes(void)
}
}
-#if defined(CONFIG_OPTPROBES)
-#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots */
- kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
-#endif
- /* By default, kprobes can be optimized */
- kprobes_allow_optimization = true;
-#endif
-
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init kprobe_optinsn_slots for allocation */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -2523,6 +2522,21 @@ static int __init init_kprobes(void)
}
early_initcall(init_kprobes);
+#if defined(CONFIG_OPTPROBES)
+static int __init init_optprobes(void)
+{
+ /*
+ * Enable kprobe optimization - this kicks the optimizer, which
+ * depends on synchronize_rcu_tasks() and ksoftirqd; neither is
+ * available at early initcall time, so delay the optimization.
+ */
+ optimize_all_kprobes();
+
+ return 0;
+}
+subsys_initcall(init_optprobes);
+#endif
+
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1578973c5740..a1972eba2917 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -963,7 +963,8 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
+ WARN_ON_FUNCTION_MISMATCH(timer->function,
+ kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f76fdb925532..335d988bd811 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -19,6 +19,7 @@
#include <linux/moduleloader.h>
#include <linux/completion.h>
#include <linux/memory.h>
+#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
@@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (mod && mod->klp_alive)
obj->mod = mod;
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
}
static bool klp_initialized(void)
@@ -163,12 +164,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
.pos = sympos,
};
- mutex_lock(&module_mutex);
if (objname)
module_kallsyms_on_each_symbol(klp_find_callback, &args);
else
kallsyms_on_each_symbol(klp_find_callback, &args);
- mutex_unlock(&module_mutex);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c6d0c1dc6253..ef28a0b9cf1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -705,7 +705,7 @@ static void print_lock_name(struct lock_class *class)
printk(KERN_CONT " (");
__print_lock_name(class);
- printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
}
@@ -930,7 +930,8 @@ static bool assign_lock_key(struct lockdep_map *lock)
/* Debug-check: all keys must be persistent! */
debug_locks_off();
pr_err("INFO: trying to register non-static key.\n");
- pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
return false;
@@ -1392,7 +1393,7 @@ static int add_lock_to_list(struct lock_class *this,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index de49f9e1c11b..ecb8662e7a4e 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -99,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_STACK_TRACE_ENTRIES 262144UL
#define STACK_TRACE_HASH_SIZE 8192
#else
-#define MAX_LOCKDEP_ENTRIES 32768UL
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
-#define STACK_TRACE_HASH_SIZE 16384
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
#endif
/*
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index adb935090768..622ebdfcd083 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -702,7 +702,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
#ifdef CONFIG_DEBUG_MUTEXES
@@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
#endif
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
@@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
@@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
@@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* ww_mutex needs to always recheck its position since its waiter
* list is not FIFO ordered.
*/
- if ((use_ww_ctx && ww_ctx) || !first) {
+ if (ww_ctx || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* or we must see its unlock and acquire.
*/
if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
spin_lock(&lock->wait_lock);
@@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1068,7 +1071,7 @@ skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
spin_unlock(&lock->wait_lock);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 4786dd271b45..b94f3831e963 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
+ int cnts;
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
/* When no more readers or writers, set the locked flag */
do {
- atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
- } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) != _QW_WAITING);
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
}
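
The qrwlock change is the usual cmpxchg-to-try_cmpxchg conversion: atomic_try_cmpxchg() writes the observed value back through its second argument on failure, so the retry loop needs no separate compare, and the acquire ordering moves from the read to the successful cmpxchg. A hedged before/after sketch with the _QW_* constants taken as parameters:

#include <linux/atomic.h>

/* Before: compare the returned old value on every iteration. */
static void writer_lock_old(atomic_t *cnts, int waiting, int locked)
{
	do {
		atomic_cond_read_acquire(cnts, VAL == waiting);
	} while (atomic_cmpxchg_relaxed(cnts, waiting, locked) != waiting);
}

/* After: try_cmpxchg updates @c in place on failure. */
static void writer_lock_new(atomic_t *cnts, int waiting, int locked)
{
	int c;

	do {
		c = atomic_cond_read_relaxed(cnts, VAL == waiting);
	} while (!atomic_try_cmpxchg_acquire(cnts, &c, locked));
}
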
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 03b21135313c..48fff6437901 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1420,7 +1420,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
}
/*
- * Performs the wakeup of the the top-waiter and re-enables preemption.
+ * Performs the wakeup of the top-waiter and re-enables preemption.
*/
void rt_mutex_postunlock(struct wake_q_head *wake_q)
{
@@ -1819,7 +1819,7 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Wait for the the lock acquisition started on our behalf by
+ * Wait for the lock acquisition started on our behalf by
* rt_mutex_start_proxy_lock(). Upon failure, the caller must call
* rt_mutex_cleanup_proxy_lock().
*
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ba67600c7b2c..abba5df50006 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1048,7 +1048,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/*
* If there were already threads queued before us and:
- * 1) there are no no active locks, wake the front
+ * 1) there are no active locks, wake the front
* queued process(es) as the handoff bit might be set.
* 2) there are no active writers and some readers, the lock
* must be read owned; so we try to wake any read lock
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index d9dd94defc0a..9aa855a96c4a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
diff --git a/kernel/module.c b/kernel/module.c
index 4bf30e4b3eaa..20fb004e7d8d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,8 +87,7 @@
* 3) module_addr_min/module_addr_max.
* (delete and add uses RCU list operations).
*/
-DEFINE_MUTEX(module_mutex);
-EXPORT_SYMBOL_GPL(module_mutex);
+static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
@@ -256,11 +255,6 @@ static void mod_update_bounds(struct module *mod)
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
-static void module_assert_mutex(void)
-{
- lockdep_assert_held(&module_mutex);
-}
-
static void module_assert_mutex_or_preempt(void)
{
#ifdef CONFIG_LOCKDEP
@@ -414,19 +408,8 @@ extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];
-extern const s32 __start___kcrctab_gpl_future[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const s32 __start___kcrctab_unused[];
-extern const s32 __start___kcrctab_unused_gpl[];
-#endif
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
@@ -434,87 +417,14 @@ extern const s32 __start___kcrctab_unused_gpl[];
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
-static bool each_symbol_in_section(const struct symsearch *arr,
- unsigned int arrsize,
- struct module *owner,
- bool (*fn)(const struct symsearch *syms,
- struct module *owner,
- void *data),
- void *data)
-{
- unsigned int j;
-
- for (j = 0; j < arrsize; j++) {
- if (fn(&arr[j], owner, data))
- return true;
- }
-
- return false;
-}
-
-/* Returns true as soon as fn returns true, otherwise false. */
-static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
- struct module *owner,
- void *data),
- void *data)
-{
- struct module *mod;
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY, false },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY, false },
- { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
- __start___kcrctab_gpl_future,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { __start___ksymtab_unused, __stop___ksymtab_unused,
- __start___kcrctab_unused,
- NOT_GPL_ONLY, true },
- { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
- __start___kcrctab_unused_gpl,
- GPL_ONLY, true },
-#endif
- };
-
- module_assert_mutex_or_preempt();
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
- return true;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY, false },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY, false },
- { mod->gpl_future_syms,
- mod->gpl_future_syms + mod->num_gpl_future_syms,
- mod->gpl_future_crcs,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms,
- mod->unused_syms + mod->num_unused_syms,
- mod->unused_crcs,
- NOT_GPL_ONLY, true },
- { mod->unused_gpl_syms,
- mod->unused_gpl_syms + mod->num_unused_gpl_syms,
- mod->unused_gpl_crcs,
- GPL_ONLY, true },
-#endif
- };
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
- return true;
- }
- return false;
-}
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const s32 *crcs;
+ enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+ } license;
+};
struct find_symbol_arg {
/* Input */
@@ -535,28 +445,8 @@ static bool check_exported_symbol(const struct symsearch *syms,
{
struct find_symbol_arg *fsa = data;
- if (!fsa->gplok) {
- if (syms->license == GPL_ONLY)
- return false;
- if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
- pr_warn("Symbol %s is being used by a non-GPL module, "
- "which will not be allowed in the future\n",
- fsa->name);
- }
- }
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- if (syms->unused && fsa->warn) {
- pr_warn("Symbol %s is marked as UNUSED, however this module is "
- "using it.\n", fsa->name);
- pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evaluate if this is the right api to use and "
- "if it really is, submit a report to the linux kernel "
- "mailing list together with submitting your code for "
- "inclusion.\n");
- }
-#endif
-
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
@@ -619,31 +509,44 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex.
*/
-static const struct kernel_symbol *find_symbol(const char *name,
- struct module **owner,
- const s32 **crc,
- enum mod_license *license,
- bool gplok,
- bool warn)
-{
- struct find_symbol_arg fsa;
-
- fsa.name = name;
- fsa.gplok = gplok;
- fsa.warn = warn;
-
- if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
- if (owner)
- *owner = fsa.owner;
- if (crc)
- *crc = fsa.crc;
- if (license)
- *license = fsa.license;
- return fsa.sym;
+static bool find_symbol(struct find_symbol_arg *fsa)
+{
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY },
+ };
+ struct module *mod;
+ unsigned int i;
+
+ module_assert_mutex_or_preempt();
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY },
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+ return true;
}
- pr_debug("Failed to find symbol %s\n", name);
- return NULL;
+ pr_debug("Failed to find symbol %s\n", fsa->name);
+ return false;
}
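
For context, the refactor above replaces six output parameters with a single struct: callers fill in the inputs of struct find_symbol_arg and read the outputs back out of it. A minimal sketch of a caller, assuming only the declarations in this file (the wrapper function itself is hypothetical):

        /* Hedged sketch of the new calling convention; example_lookup()
         * is illustrative and not part of the patch.
         */
        static bool example_lookup(const char *name)
        {
                struct find_symbol_arg fsa = {
                        .name  = name,
                        .gplok = true,  /* caller may bind GPL-only exports */
                        .warn  = false,
                };
                bool found;

                /* find_symbol() needs preemption disabled or module_mutex. */
                preempt_disable();
                found = find_symbol(&fsa);
                preempt_enable();

                /* On success, fsa.owner, fsa.crc, fsa.sym and fsa.license are set. */
                return found;
        }
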
/*
@@ -669,10 +572,8 @@ static struct module *find_module_all(const char *name, size_t len,
struct module *find_module(const char *name)
{
- module_assert_mutex();
return find_module_all(name, strlen(name), false);
}
-EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -1107,12 +1008,15 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
void __symbol_put(const char *symbol)
{
- struct module *owner;
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ };
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
+ if (!find_symbol(&fsa))
BUG();
- module_put(owner);
+ module_put(fsa.owner);
preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);
@@ -1381,19 +1285,22 @@ bad_version:
static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
- const s32 *crc;
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
/*
* Since this should be found in kernel (which can't be removed), no
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
+ if (!find_symbol(&fsa)) {
preempt_enable();
BUG();
}
preempt_enable();
- return check_version(info, "module_layout", mod, crc);
+ return check_version(info, "module_layout", mod, fsa.crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1487,10 +1394,11 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
const char *name,
char ownername[])
{
- struct module *owner;
- const struct kernel_symbol *sym;
- const s32 *crc;
- enum mod_license license;
+ struct find_symbol_arg fsa = {
+ .name = name,
+ .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+ .warn = true,
+ };
int err;
/*
@@ -1500,42 +1408,40 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc, &license,
- !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- if (!sym)
+ if (!find_symbol(&fsa))
goto unlock;
- if (license == GPL_ONLY)
+ if (fsa.license == GPL_ONLY)
mod->using_gplonly_symbols = true;
- if (!inherit_taint(mod, owner)) {
- sym = NULL;
+ if (!inherit_taint(mod, fsa.owner)) {
+ fsa.sym = NULL;
goto getname;
}
- if (!check_version(info, name, mod, crc)) {
- sym = ERR_PTR(-EINVAL);
+ if (!check_version(info, name, mod, fsa.crc)) {
+ fsa.sym = ERR_PTR(-EINVAL);
goto getname;
}
- err = verify_namespace_is_imported(info, sym, mod);
+ err = verify_namespace_is_imported(info, fsa.sym, mod);
if (err) {
- sym = ERR_PTR(err);
+ fsa.sym = ERR_PTR(err);
goto getname;
}
- err = ref_module(mod, owner);
+ err = ref_module(mod, fsa.owner);
if (err) {
- sym = ERR_PTR(err);
+ fsa.sym = ERR_PTR(err);
goto getname;
}
getname:
/* We must make a copy under the lock if we failed to get a ref. */
- strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+ strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
unlock:
mutex_unlock(&module_mutex);
- return sym;
+ return fsa.sym;
}
static const struct kernel_symbol *
@@ -2240,6 +2146,8 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+static void cfi_cleanup(struct module *mod);
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -2281,6 +2189,9 @@ static void free_module(struct module *mod)
synchronize_rcu();
mutex_unlock(&module_mutex);
+ /* Clean up CFI for the module. */
+ cfi_cleanup(mod);
+
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
@@ -2296,16 +2207,19 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
- struct module *owner;
- const struct kernel_symbol *sym;
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ .warn = true,
+ };
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
- if (sym && strong_try_module_get(owner))
- sym = NULL;
+ if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
+ preempt_enable();
+ return NULL;
+ }
preempt_enable();
-
- return sym ? (void *)kernel_symbol_value(sym) : NULL;
+ return (void *)kernel_symbol_value(fsa.sym);
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -2318,7 +2232,6 @@ EXPORT_SYMBOL_GPL(__symbol_get);
static int verify_exported_symbols(struct module *mod)
{
unsigned int i;
- struct module *owner;
const struct kernel_symbol *s;
struct {
const struct kernel_symbol *sym;
@@ -2326,21 +2239,19 @@ static int verify_exported_symbols(struct module *mod)
} arr[] = {
{ mod->syms, mod->num_syms },
{ mod->gpl_syms, mod->num_gpl_syms },
- { mod->gpl_future_syms, mod->num_gpl_future_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms, mod->num_unused_syms },
- { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
};
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- NULL, true, false)) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
- module_name(owner));
+ module_name(fsa.owner));
return -ENOEXEC;
}
}
@@ -2348,6 +2259,21 @@ static int verify_exported_symbols(struct module *mod)
return 0;
}
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
/* Change all symbols so that st_value encodes the pointer directly. */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
@@ -2395,8 +2321,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
break;
}
- /* Ok if weak. */
- if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
break;
ret = PTR_ERR(ksym) ?: -ENOENT;
@@ -2964,7 +2892,7 @@ static int module_sig_check(struct load_info *info, int flags)
}
if (is_module_sig_enforced()) {
- pr_notice("%s: loading of %s is rejected\n", info->name, reason);
+ pr_notice("Loading of %s is rejected\n", reason);
return -EKEYREJECTED;
}
@@ -2977,9 +2905,33 @@ static int module_sig_check(struct load_info *info, int flags)
}
#endif /* !CONFIG_MODULE_SIG */
-/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
-static int elf_header_check(struct load_info *info)
+static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
+ unsigned long secend;
+
+ /*
+ * Check for both overflow and offset/size being
+ * too large.
+ */
+ secend = shdr->sh_offset + shdr->sh_size;
+ if (secend < shdr->sh_offset || secend > info->len)
+ return -ENOEXEC;
+
+ return 0;
+}
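
The single addition above guards two failure modes at once: integer wrap-around of sh_offset + sh_size, and a section that simply extends past the end of the image. A standalone restatement of the check, with made-up values in the comment:

        /* Minimal sketch of the same bounds check; 'len' plays the role of
         * info->len and the numbers are hypothetical. On 32-bit,
         * off = 0xfffff000 and size = 0x2000 wrap to end = 0x1000, tripping
         * 'end < off'; an oversized but unwrapped section trips 'end > len'.
         */
        static int example_check_range(unsigned long off, unsigned long size,
                                       unsigned long len)
        {
                unsigned long end = off + size;

                if (end < off || end > len)
                        return -1;
                return 0;
        }
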
+
+/*
+ * Sanity checks against invalid binaries, wrong arch, weird elf version.
+ *
+ * Also do basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ */
+static int elf_validity_check(struct load_info *info)
+{
+ unsigned int i;
+ Elf_Shdr *shdr, *strhdr;
+ int err;
+
if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
@@ -2989,11 +2941,78 @@ static int elf_header_check(struct load_info *info)
|| info->hdr->e_shentsize != sizeof(Elf_Shdr))
return -ENOEXEC;
+ /*
+ * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+ * known and small. So e_shnum * sizeof(Elf_Shdr)
+ * will not overflow unsigned long on any platform.
+ */
if (info->hdr->e_shoff >= info->len
|| (info->hdr->e_shnum * sizeof(Elf_Shdr) >
info->len - info->hdr->e_shoff))
return -ENOEXEC;
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+ /*
+ * Verify if the section name table index is valid.
+ */
+ if (info->hdr->e_shstrndx == SHN_UNDEF
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum)
+ return -ENOEXEC;
+
+ strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+ err = validate_section_offset(info, strhdr);
+ if (err < 0)
+ return err;
+
+ /*
+ * The section name table must be NUL-terminated, as required
+ * by the spec. This makes strcmp and pr_* calls that access
+ * strings in the section safe.
+ */
+ info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (info->secstrings[strhdr->sh_size - 1] != '\0')
+ return -ENOEXEC;
+
+ /*
+ * The code assumes that section 0 has a length of zero and
+ * an addr of zero, so check for it.
+ */
+ if (info->sechdrs[0].sh_type != SHT_NULL
+ || info->sechdrs[0].sh_size != 0
+ || info->sechdrs[0].sh_addr != 0)
+ return -ENOEXEC;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ shdr = &info->sechdrs[i];
+ switch (shdr->sh_type) {
+ case SHT_NULL:
+ case SHT_NOBITS:
+ continue;
+ case SHT_SYMTAB:
+ if (shdr->sh_link == SHN_UNDEF
+ || shdr->sh_link >= info->hdr->e_shnum)
+ return -ENOEXEC;
+ fallthrough;
+ default:
+ err = validate_section_offset(info, shdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return err;
+ }
+
+ if (shdr->sh_flags & SHF_ALLOC) {
+ if (shdr->sh_name >= strhdr->sh_size) {
+ pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return -ENOEXEC;
+ }
+ }
+ break;
+ }
+ }
+
return 0;
}
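
The loop skips SHT_NULL and SHT_NOBITS because neither occupies bytes in the file: a .bss-style SHT_NOBITS section has a non-zero sh_size but no file data, so an offset check would be meaningless. A compact restatement of that per-section rule (the helper name is hypothetical):

        /* Sections that must fit inside the module image on disk. */
        static bool example_section_has_file_data(unsigned int sh_type)
        {
                return sh_type != SHT_NULL && sh_type != SHT_NOBITS;
        }
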
@@ -3095,11 +3114,6 @@ static int rewrite_section_headers(struct load_info *info, int flags)
for (i = 1; i < info->hdr->e_shnum; i++) {
Elf_Shdr *shdr = &info->sechdrs[i];
- if (shdr->sh_type != SHT_NOBITS
- && info->len < shdr->sh_offset + shdr->sh_size) {
- pr_err("Module len %lu truncated\n", info->len);
- return -ENOEXEC;
- }
/*
* Mark all sections sh_addr with their address in the
@@ -3133,11 +3147,6 @@ static int setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
- /* Set up the convenience variables */
- info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
- info->secstrings = (void *)info->hdr
- + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
-
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
if (info->index.info)
@@ -3241,22 +3250,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->gpl_syms),
&mod->num_gpl_syms);
mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(info,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(info, "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(info, "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
+
#ifdef CONFIG_CONSTRUCTORS
mod->ctors = section_objs(info, ".ctors",
sizeof(*mod->ctors), &mod->num_ctors);
@@ -3437,14 +3431,8 @@ static int check_module_license_and_versions(struct module *mod)
pr_warn("%s: module license taints kernel.\n", mod->name);
#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs)
- || (mod->num_gpl_syms && !mod->gpl_crcs)
- || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !mod->unused_crcs)
- || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
- ) {
+ if ((mod->num_syms && !mod->crcs) ||
+ (mod->num_gpl_syms && !mod->gpl_crcs)) {
return try_to_force_load(mod,
"no versions for exported symbols");
}
@@ -3883,6 +3871,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
+static void cfi_init(struct module *mod);
+
/*
* Allocate and load the module: note that size of section 0 is always
* zero, and we rely on this for optional sections.
@@ -3894,26 +3884,50 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err = 0;
char *after_dashes;
- err = elf_header_check(info);
+ /*
+ * Do the signature check (if any) first. All that
+ * the signature check needs is info->len; it does
+ * not need any of the section info. That can be
+ * set up later. This will minimize the chances
+ * of a corrupt module causing problems before
+ * we even get to the signature check.
+ *
+ * The check will also adjust info->len by stripping
+ * off the sig length at the end of the module, making
+ * checks against info->len more correct.
+ */
+ err = module_sig_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /*
+ * Do basic sanity checks against the ELF header and
+ * sections.
+ */
+ err = elf_validity_check(info);
if (err) {
- pr_err("Module has invalid ELF header\n");
+ pr_err("Module has invalid ELF structures\n");
goto free_copy;
}
+ /*
+ * Everything checks out, so set up the section info
+ * in the info structure.
+ */
err = setup_load_info(info, flags);
if (err)
goto free_copy;
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
if (blacklisted(info->name)) {
err = -EPERM;
pr_err("Module %s is blacklisted\n", info->name);
goto free_copy;
}
- err = module_sig_check(info, flags);
- if (err)
- goto free_copy;
-
err = rewrite_section_headers(info, flags);
if (err)
goto free_copy;
@@ -3990,6 +4004,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
flush_module_icache(mod);
+ /* Setup CFI for the module. */
+ cfi_init(mod);
+
/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
@@ -4063,6 +4080,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
+ cfi_cleanup(mod);
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
@@ -4374,16 +4392,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
+#ifdef CONFIG_LIVEPATCH
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
struct module *mod;
unsigned int i;
- int ret;
-
- module_assert_mutex();
+ int ret = 0;
+ mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list) {
/* We hold module_mutex: no need for rcu_dereference_sched */
struct mod_kallsyms *kallsyms = mod->kallsyms;
@@ -4399,13 +4417,47 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
ret = fn(data, kallsyms_symbol_name(kallsyms, i),
mod, kallsyms_symbol_value(sym));
if (ret != 0)
- return ret;
+ break;
}
}
- return 0;
+ mutex_unlock(&module_mutex);
+ return ret;
}
+#endif /* CONFIG_LIVEPATCH */
#endif /* CONFIG_KALLSYMS */
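
Since module_kallsyms_on_each_symbol() now takes module_mutex itself, its one remaining user (livepatch) only has to supply a callback. A hedged sketch of a caller; the callback and the looked-up name are hypothetical:

        /* Returning non-zero stops the iteration and is passed back to
         * the caller of module_kallsyms_on_each_symbol().
         */
        static int example_match_cb(void *data, const char *sym_name,
                                    struct module *mod, unsigned long addr)
        {
                return strcmp(sym_name, (const char *)data) == 0;
        }

        /* Usage:
         *      ret = module_kallsyms_on_each_symbol(example_match_cb,
         *                                           (void *)"target_symbol");
         */
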
+static void cfi_init(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ initcall_t *init;
+ exitcall_t *exit;
+
+ rcu_read_lock_sched();
+ mod->cfi_check = (cfi_check_fn)
+ find_kallsyms_symbol_value(mod, "__cfi_check");
+ init = (initcall_t *)
+ find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
+ exit = (exitcall_t *)
+ find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
+ rcu_read_unlock_sched();
+
+ /* Fix init/exit functions to point to the CFI jump table */
+ if (init)
+ mod->init = *init;
+ if (exit)
+ mod->exit = *exit;
+
+ cfi_module_add(mod, module_addr_min);
+#endif
+}
+
+static void cfi_cleanup(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ cfi_module_remove(mod, module_addr_min);
+#endif
+}
+
/* Maximum number of characters written by module_flags() */
#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
index 4224a1086b7d..00132d12487c 100644
--- a/kernel/module_signature.c
+++ b/kernel/module_signature.c
@@ -25,7 +25,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len,
return -EBADMSG;
if (ms->id_type != PKEY_ID_PKCS7) {
- pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ pr_err("%s: not signed with expected PKCS#7 message\n",
name);
return -ENOPKG;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 9d9fc678c91d..8723ae70ea1f 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -30,7 +30,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
- ret = mod_check_sig(&ms, modlen, info->name);
+ ret = mod_check_sig(&ms, modlen, "module");
if (ret)
return ret;
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9af5a50d3489..b29c8aca7486 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work)
goto out;
/*
- * If the wakeup occured for an unknown reason, wait to prevent the
+ * If the wakeup occurred for an unknown reason, wait to prevent the
* system from trying to suspend and waking up in a tight loop.
*/
if (final_count == initial_count)
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 1358fa4abfa8..0f4530b3a8cd 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -98,7 +98,7 @@ static int __init em_debug_init(void)
return 0;
}
-core_initcall(em_debug_init);
+fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d63560e1cf87..1a221dcb3c01 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -329,7 +329,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
/**
* Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
+ * Memory bitmap is a structure consisting of many linked lists of
* objects. The main list's elements are of type struct zone_bitmap
* and each of them corresponds to one zone. For each zone bitmap
* object there is a list of objects of type struct bm_block that
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 72e33054a2e1..bea3cb8afa11 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -884,7 +884,7 @@ out_clean:
* enough_swap - Make sure we have enough swap to save the image.
*
* Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
+ * space available from the resume partition.
*/
static int enough_swap(unsigned int nr_pages)
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 3a8fd491758c..51615c909b2f 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -12,8 +12,6 @@
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-extern raw_spinlock_t logbuf_lock;
-
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -21,7 +19,6 @@ int vprintk_store(int facility, int level,
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
@@ -56,10 +53,8 @@ void defer_console_output(void);
#else
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
-
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 575a34b88936..421c35571797 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -262,7 +262,7 @@ static void __up_console_sem(unsigned long ip)
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
- * locked without the console sempahore held).
+ * locked without the console semaphore held).
*/
static int console_locked, console_suspended;
@@ -355,62 +355,50 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_RAW_SPINLOCK(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
+/* All 3 protected by @console_sem. */
/* the next printk record to write to the console */
static u64 console_seq;
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
+
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
#else
#define PREFIX_MAX 32
#endif
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+
+/* the maximum size of a formatted record (i.e. with prefix added per line) */
+#define CONSOLE_LOG_MAX 1024
+
+/* the maximum size allowed to be reserved for a record */
+#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -452,6 +440,31 @@ bool printk_percpu_data_ready(void)
return __printk_percpu_data_ready;
}
+/* Must be called under syslog_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
+{
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[0] = val;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+}
+
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
+{
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
+
+ do {
+ seq = raw_read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
+
+ return val;
+}
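
The latch gives lock-free readers a consistent 64-bit value even on 32-bit machines: the writer flips between two slots around a sequence bump, and a reader retries if it raced with a flip. A hedged usage sketch with hypothetical names:

        static struct latched_seq example_seq = {
                .latch = SEQCNT_LATCH_ZERO(example_seq.latch),
        };

        /* Writer: must be externally serialized (clear_seq uses syslog_lock). */
        static void example_update(u64 val)
        {
                latched_seq_write(&example_seq, val);
        }

        /* Reader: safe from any context, no lock taken. */
        static u64 example_get(void)
        {
                return latched_seq_read_nolock(&example_seq);
        }
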
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -619,7 +632,7 @@ out:
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -719,27 +732,27 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- logbuf_lock_irq();
- if (!prb_read_valid(prb, user->seq, r)) {
+ printk_safe_enter_irq();
+ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, user->seq, r));
+ prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
}
- if (r->info->seq != user->seq) {
+ if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
- user->seq = r->info->seq;
+ atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
@@ -748,8 +761,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->text_buf[0], r->info->text_len,
&r->info->dev_info);
- user->seq = r->info->seq + 1;
- logbuf_unlock_irq();
+ atomic64_set(&user->seq, r->info->seq + 1);
+ printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -784,11 +797,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -796,16 +809,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->seq = clear_seq;
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->seq = prb_next_seq(prb);
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -820,15 +833,15 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
- if (prb_read_valid_info(prb, user->seq, &info, NULL)) {
+ printk_safe_enter_irq();
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
- if (info.seq != user->seq)
+ if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -861,9 +874,9 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- logbuf_lock_irq();
- user->seq = prb_first_valid_seq(prb);
- logbuf_unlock_irq();
+ printk_safe_enter_irq();
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
+ printk_safe_exit_irq();
file->private_data = user;
return 0;
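
user->seq becomes an atomic64_t because, with logbuf_lock gone, devkmsg_read(), devkmsg_llseek() and devkmsg_poll() may all touch it concurrently; the atomic64 accessors guarantee tear-free 64-bit loads and stores even on 32-bit architectures. A minimal sketch of the pattern, with hypothetical names:

        static atomic64_t example_seq = ATOMIC64_INIT(0);

        static void example_advance(u64 next)
        {
                atomic64_set(&example_seq, next);   /* e.g. the read path */
        }

        static u64 example_snapshot(void)
        {
                return atomic64_read(&example_seq); /* e.g. the poll path */
        }
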
@@ -955,6 +968,9 @@ void log_buf_vmcoreinfo_setup(void)
VMCOREINFO_SIZE(atomic_long_t);
VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -1421,6 +1437,50 @@ static size_t get_record_print_text_size(struct printk_info *info,
return ((prefix_len * line_count) + info->text_len + 1);
}
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is a best effort
+ * that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
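
A worked example of the two passes, with made-up record sizes: suppose records 10..13 would format to 300, 200, 100 and 50 bytes and @size is 200. The first loop sums len = 650; the second subtracts from the front (650 - 300 = 350, then 350 - 200 = 150) and stops at record 12 because 150 <= 200, so the function returns 12 and records 12 and 13 are the ones dumped.
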
+
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
@@ -1428,19 +1488,21 @@ static int syslog_print(char __user *buf, int size)
char *text;
int len = 0;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
while (size > 0) {
size_t n;
size_t skip;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
break;
}
if (r.info->seq != syslog_seq) {
@@ -1469,7 +1531,8 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
if (!n)
break;
@@ -1492,34 +1555,26 @@ static int syslog_print(char __user *buf, int size)
static int syslog_print_all(char __user *buf, int size, bool clear)
{
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
char *text;
int len = 0;
u64 seq;
bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
- len += get_record_print_text_size(&info, line_count, true, time);
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
- /* move first record forward until length fits into the buffer */
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
- if (len <= size)
- break;
- len -= get_record_print_text_size(&info, line_count, true, time);
- }
-
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -1532,20 +1587,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
if (len < 0)
break;
}
- if (clear)
- clear_seq = seq;
- logbuf_unlock_irq();
+ if (clear) {
+ raw_spin_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ raw_spin_unlock(&syslog_lock);
+ }
+ printk_safe_exit_irq();
kfree(text);
return len;
@@ -1553,9 +1611,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- logbuf_lock_irq();
- clear_seq = prb_next_seq(prb);
- logbuf_unlock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
+}
+
+/* Return a consistent copy of @syslog_seq. */
+static u64 read_syslog_seq_irq(void)
+{
+ u64 seq;
+
+ raw_spin_lock_irq(&syslog_lock);
+ seq = syslog_seq;
+ raw_spin_unlock_irq(&syslog_lock);
+
+ return seq;
}
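
The lock is needed for more than mutual exclusion with writers: @syslog_seq is a plain u64, and 32-bit targets cannot load a u64 atomically, so an unlocked reader could observe a torn value. A hedged restatement of the idiom with hypothetical names:

        static u64 example_read_u64(raw_spinlock_t *lock, const u64 *val)
        {
                u64 snapshot;

                raw_spin_lock_irq(lock);
                snapshot = *val;        /* one consistent 64-bit snapshot */
                raw_spin_unlock_irq(lock);

                return snapshot;
        }
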
int do_syslog(int type, char __user *buf, int len, int source)
@@ -1581,8 +1653,9 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
+
error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, syslog_seq, NULL));
+ prb_read_valid(prb, read_syslog_seq_irq(), NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1630,10 +1703,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
return 0;
}
if (info.seq != syslog_seq) {
@@ -1661,7 +1736,8 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -2104,12 +2180,6 @@ asmlinkage int vprintk_emit(int facility, int level,
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_func(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
@@ -2143,7 +2213,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_func(fmt, args);
+ r = vprintk(fmt, args);
va_end(args);
return r;
@@ -2152,8 +2222,7 @@ EXPORT_SYMBOL(printk);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
+#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
@@ -2262,7 +2331,7 @@ static int __init console_setup(char *str)
/*
* console="" or console=null have been suggested as a way to
* disable console output. Use ttynull that has been created
- * for exacly this purpose.
+ * for exactly this purpose.
*/
if (str[0] == 0 || strcmp(str, "null") == 0) {
__add_preferred_console("ttynull", 0, NULL, NULL, true);
@@ -2471,7 +2540,7 @@ static inline int can_use_console(void)
void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
+ static char text[CONSOLE_LOG_MAX];
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
@@ -2518,7 +2587,6 @@ again:
size_t len;
printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2562,7 +2630,6 @@ skip:
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
console_seq++;
- raw_spin_unlock(&logbuf_lock);
/*
* While actively printing out messages, if another printk()
@@ -2589,8 +2656,6 @@ skip:
console_locked = 0;
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
/*
@@ -2599,9 +2664,7 @@ skip:
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- raw_spin_lock(&logbuf_lock);
retry = prb_read_valid(prb, console_seq, NULL);
- raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
@@ -2668,9 +2731,9 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
unsigned long flags;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
console_seq = prb_first_valid_seq(prb);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
}
console_unlock();
}
@@ -2898,9 +2961,7 @@ void register_console(struct console *newcon)
/*
* console_unlock(); will print out the buffered messages
* for us.
- */
- logbuf_lock_irqsave(flags);
- /*
+ *
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
* the already-registered consoles.
@@ -2911,8 +2972,11 @@ void register_console(struct console *newcon)
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+
+ /* Get a consistent copy of @syslog_seq. */
+ raw_spin_lock_irqsave(&syslog_lock, flags);
console_seq = syslog_seq;
- logbuf_unlock_irqrestore(flags);
+ raw_spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();
console_sysfs_notify();
@@ -3042,7 +3106,7 @@ void __init console_init(void)
*
* To mitigate this problem somewhat, only unregister consoles whose memory
* intersects with the init section. Note that all other boot consoles will
- * get unregistred when the real preferred console is registered.
+ * get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
@@ -3276,7 +3340,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3293,26 +3356,15 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (reason > max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
-
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->next_seq = prb_next_seq(prb);
- logbuf_unlock_irqrestore(flags);
-
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
}
rcu_read_unlock();
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3326,30 +3378,31 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
unsigned int line_count;
struct printk_record r;
+ unsigned long flags;
size_t l = 0;
bool ret = false;
- prb_rec_init_rd(&r, &info, line, size);
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
- if (!dumper->active)
- goto out;
+ printk_safe_enter_irqsave(flags);
+ prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
if (line) {
- if (!prb_read_valid(prb, dumper->cur_seq, &r))
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
goto out;
l = record_print_text(&r, syslog, printk_time);
} else {
- if (!prb_read_valid_info(prb, dumper->cur_seq,
+ if (!prb_read_valid_info(prb, iter->cur_seq,
&info, &line_count)) {
goto out;
}
@@ -3358,52 +3411,23 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
}
- dumper->cur_seq = r.info->seq + 1;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
+ printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
* with as many of the *youngest* kmsg records that fit into it.
@@ -3416,115 +3440,93 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
+ char *buf, size_t size, size_t *len_out)
{
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
unsigned long flags;
u64 seq;
u64 next_seq;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
bool time = printk_time;
- prb_rec_init_rd(&r, &info, buf, size);
-
- if (!dumper->active || !buf || !size)
+ if (!buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
- if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) {
- if (info.seq != dumper->cur_seq) {
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
+
+ printk_safe_enter_irqsave(flags);
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
- dumper->cur_seq = info.seq;
+ iter->cur_seq = info.seq;
}
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq) {
+ printk_safe_exit_irqrestore(flags);
goto out;
}
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- while (prb_read_valid_info(prb, seq, &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
- break;
- l += get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- while (l >= size && prb_read_valid_info(prb, seq,
- &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
- break;
- l -= get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
- }
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
+ */
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
+ size - 1, syslog, time);
- /* last message in next interation */
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump the block of
+ * older records stored right before this one.
+ */
next_seq = seq;
- /* actually read text into the buffer now */
- l = 0;
- while (prb_read_valid(prb, seq, &r)) {
- if (r.info->seq >= dumper->next_seq)
- break;
+ prb_rec_init_rd(&r, &info, buf, size);
- l += record_print_text(&r, syslog, time);
+ len = 0;
+ prb_for_each_record(seq, prb, seq, &r) {
+ if (r.info->seq >= iter->next_seq)
+ break;
- /* adjust record to store to remaining buffer space */
- prb_rec_init_rd(&r, &info, buf + l, size - l);
+ len += record_print_text(&r, syslog, time);
- seq = r.info->seq + 1;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
+ iter->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
- *
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
- */
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
-{
- dumper->cur_seq = clear_seq;
- dumper->next_seq = prb_next_seq(prb);
-}
-
-/**
* kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
unsigned long flags;
- logbuf_lock_irqsave(flags);
- kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_enter_irqsave(flags);
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
+ printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
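
With iteration state moved out of struct kmsg_dumper, each dump() invocation owns a private kmsg_dump_iter on its stack. A hedged sketch of a dump callback under the new API (the dumper and its output sink are hypothetical):

        static void example_dump(struct kmsg_dumper *dumper,
                                 enum kmsg_dump_reason reason)
        {
                static char line[1024];
                struct kmsg_dump_iter iter;
                size_t len;

                kmsg_dump_rewind(&iter);    /* initializes cur_seq/next_seq */
                while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) {
                        /* write 'len' bytes of 'line' to the persistent store */
                }
        }
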
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 2e9e3ed7d63e..7a1414622051 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -16,7 +16,7 @@
#include "internal.h"
/*
- * printk() could not take logbuf_lock in NMI context. Instead,
+ * In NMI and safe mode, printk() avoids taking locks. Instead,
* it uses an alternative implementation that temporarily stores
* the strings into a per-CPU buffer. The content of the buffer
* is later flushed into the main ring buffer via IRQ work.
@@ -267,17 +267,9 @@ void printk_safe_flush(void)
void printk_safe_flush_on_panic(void)
{
/*
- * Make sure that we could access the main ring buffer.
+ * Make sure that we can access the safe buffers.
* Do not risk a double release when more CPUs are up.
*/
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
if (raw_spin_is_locked(&safe_read_lock)) {
if (num_online_cpus() > 1)
return;
@@ -319,9 +311,7 @@ void noinstr printk_nmi_exit(void)
* reordering.
*
* It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
+ * will store the messages into the main logbuf directly.
*/
void printk_nmi_direct_enter(void)
{
@@ -367,7 +357,7 @@ void __printk_safe_exit(void)
this_cpu_dec(printk_context);
}
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
/* Allow to pass printk() to kdb but avoid a recursion. */
@@ -376,20 +366,21 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
#endif
/*
- * Try to use the main logbuf even in NMI. But avoid calling console
+ * Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
+ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
+ unsigned long flags;
int len;
+ printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- raw_spin_unlock(&logbuf_lock);
+ printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
+ /* Use extra buffer in NMI. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
@@ -420,3 +411,4 @@ void __init printk_safe_init(void)
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
+EXPORT_SYMBOL(vprintk);
diff --git a/kernel/profile.c b/kernel/profile.c
index 6f69a4195d56..c2ebddb5e974 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -430,7 +430,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index eb1b15850761..a6ad5eb2fa73 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
diff --git a/kernel/resource.c b/kernel/resource.c
index 833394f9c608..627e61b0c124 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -18,12 +18,15 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
#include <asm/io.h>
@@ -1119,6 +1122,55 @@ resource_size_t resource_alignment(struct resource *res)
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+ /* pairs with smp_store_release() in iomem_init_inode() */
+ struct inode *inode = smp_load_acquire(&iomem_inode);
+
+ /*
+ * Check that the initialization has completed. Losing the race
+ * is ok because it means drivers are claiming resources before
+ * the fs_initcall level of init, which prevents iomem_get_mapping()
+ * users from establishing mappings.
+ */
+ if (!inode)
+ return;
+
+ /*
+ * The expectation is that the driver has successfully marked
+ * the resource busy by this point, so devmem_is_allowed()
+ * should start returning false; however, for performance, this
+ * does not iterate the entire resource range.
+ */
+ if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+ devmem_is_allowed(PHYS_PFN(res->end))) {
+ /*
+ * *cringe* iomem=relaxed says "go ahead, what's the
+ * worst that can happen?"
+ */
+ return;
+ }
+
+ unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+ /*
+ * This function is only called from file open paths, so it is
+ * guaranteed that fs_initcalls have completed and no NULL check is
+ * needed. But since revoke_iomem() can be called before the initcall,
+ * we still need the barrier to appease checkers.
+ */
+ return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
/**
* __request_region - create a new busy resource region
* @parent: parent resource descriptor
@@ -1186,7 +1238,7 @@ struct resource * __request_region(struct resource *parent,
write_unlock(&resource_lock);
if (res && orig_parent == &iomem_resource)
- revoke_devmem(res);
+ revoke_iomem(res);
return res;
}
@@ -1786,4 +1838,48 @@ static int __init strict_iomem(char *str)
return 1;
}
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+ .name = "iomem",
+ .owner = THIS_MODULE,
+ .init_fs_context = iomem_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+ static struct vfsmount *iomem_vfs_mount;
+ static int iomem_fs_cnt;
+ struct inode *inode;
+ int rc;
+
+ rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+ if (rc < 0) {
+ pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+ return rc;
+ }
+
+ inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ if (IS_ERR(inode)) {
+ rc = PTR_ERR(inode);
+ pr_err("Cannot allocate inode for iomem: %d\n", rc);
+ simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+ return rc;
+ }
+
+ /*
+ * Publish iomem revocation inode initialized.
+ * Pairs with smp_load_acquire() in revoke_iomem().
+ */
+ smp_store_release(&iomem_inode, inode);
+
+ return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
__setup("iomem=", strict_iomem);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca2bb629595f..b2890f6e6d6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,8 +1862,13 @@ struct migration_arg {
struct set_affinity_pending *pending;
};
+/*
+ * @refs: number of outstanding wait_for_completion() callers
+ * @stop_pending: whether @stop_work is currently in use
+ */
struct set_affinity_pending {
refcount_t refs;
+ unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
@@ -1898,8 +1903,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
- struct set_affinity_pending *pending;
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
@@ -1921,7 +1926,6 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
- pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1932,21 +1936,14 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
- p->migration_pending = NULL;
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
}
- /* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
- /*
- * When this was migrate_enable() but we no longer
- * have a @pending, a concurrent SCA 'fixed' things
- * and we should be valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
- }
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
@@ -1956,7 +1953,14 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;
- } else if (dest_cpu < 0 || pending) {
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1971,43 +1975,32 @@ static int migration_cpu_stop(void *data)
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- p->migration_pending = NULL;
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
goto out;
}
/*
- * When this was migrate_enable() but we no longer have an
- * @pending, a concurrent SCA 'fixed' things and we should be
- * valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
- goto out;
- }
-
- /*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
+ WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
+ if (pending)
+ pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
- /* For pending->{arg,stop_work} */
- pending = arg->pending;
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}
@@ -2194,11 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
- struct migration_arg arg = {
- .task = p,
- .dest_cpu = dest_cpu,
- };
- bool complete = false;
+ bool stop_pending, complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
@@ -2210,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);
if (push_task) {
@@ -2224,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
if (complete)
- goto do_complete;
+ complete_all(&pending->done);
return 0;
}
@@ -2235,6 +2228,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1, /* any */
+ .pending = &my_pending,
+ };
+
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
@@ -2259,45 +2258,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (flags & SCA_MIGRATE_ENABLE) {
-
- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
- p->migration_flags &= ~MDF_PUSH;
- task_rq_unlock(rq, p, rf);
-
- pending->arg = (struct migration_arg) {
- .task = p,
- .dest_cpu = -1,
- .pending = pending,
- };
-
- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
- &pending->arg, &pending->stop_work);
-
- return 0;
- }
-
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
- * Lessen races (and headaches) by delegating
- * is_migration_disabled(p) checks to the stopper, which will
- * run on the same CPU as said p.
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
*/
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);
-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2305,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */
/*
* Block the original owner of &pending until all subsequent callers
@@ -2313,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}
@@ -6386,6 +6384,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
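
The refcount choreography above encodes a stack-lifetime rule: set_affinity_pending lives on the caller's stack, every other user holds a reference across its wait, and the owner may not return until the count hits zero; wake_up_var() only ever uses the address of the counter, never its contents, hence the "No UaF, just an address" remark. A hedged pthreads analogue of that rule (wait_var_event()/wake_up_var() modeled with a condition variable):

    #include <pthread.h>
    #include <stdatomic.h>

    struct pending_sketch {
        atomic_int      refs;
        pthread_mutex_t lock;
        pthread_cond_t  cond;
    };

    /* refcount_dec_and_test() + wake_up_var() analogue. */
    static void put_ref(struct pending_sketch *p)
    {
        if (atomic_fetch_sub(&p->refs, 1) == 1) {
            pthread_mutex_lock(&p->lock);
            pthread_cond_broadcast(&p->cond);
            pthread_mutex_unlock(&p->lock);
        }
    }

    /* wait_var_event() analogue: the stack owner blocks here before its
     * pending structure goes out of scope. */
    static void owner_wait(struct pending_sketch *p)
    {
        pthread_mutex_lock(&p->lock);
        while (atomic_load(&p->refs) != 0)
            pthread_cond_wait(&p->cond, &p->lock);
        pthread_mutex_unlock(&p->lock);
    }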
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 41e498b0008a..6ee9c9bbe505 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -26,7 +26,7 @@ struct sugov_policy {
struct sugov_tunables *tunables;
struct list_head tunables_hook;
- raw_spinlock_t update_lock; /* For shared policies */
+ raw_spinlock_t update_lock;
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
@@ -114,19 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
- if (sugov_update_next_freq(sg_policy, time, next_freq))
- cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
-}
-
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
-{
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
-
if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
@@ -320,23 +309,21 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
-static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->limits_changed = true;
+ sg_cpu->sg_policy->limits_changed = true;
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
u64 time, unsigned int flags)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
- if (!sugov_should_update_freq(sg_policy, time))
+ if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
sugov_get_util(sg_cpu);
@@ -368,16 +355,19 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
sg_policy->cached_raw_freq = cached_freq;
}
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ return;
+
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
raw_spin_unlock(&sg_policy->update_lock);
}
}
@@ -451,17 +441,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ goto unlock;
+
if (sg_policy->policy->fast_switch_enabled)
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
else
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
}
-
+unlock:
raw_spin_unlock(&sg_policy->update_lock);
}
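
With sugov_update_next_freq() hoisted in front of both delivery paths, the "same frequency, rate limited" check is made exactly once, and a skipped update skips the fast-switch and deferred paths together. A compilable sketch of the resulting control flow (stub functions, not the cpufreq API):

    static unsigned int cur_freq;

    static void fast_switch(unsigned int f)     { cur_freq = f; }  /* driver fast path */
    static void deferred_update(unsigned int f) { cur_freq = f; }  /* irq_work path    */

    /* Single gate shared by both paths; zero means nothing to program. */
    static int update_next_freq(unsigned int next)
    {
        return cur_freq != next;
    }

    static void sugov_like_update(int fast_switch_enabled, unsigned int next)
    {
        if (!update_next_freq(next))
            return;                     /* both paths skipped together */
        if (fast_switch_enabled)
            fast_switch(next);
        else
            deferred_update(next);
    }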
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5f611658eeab..2c36a5fad589 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- pc = preempt_count() - offset;
+ pc = irq_count() - offset;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struct *prev)
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- unsigned int pc = preempt_count() - offset;
+ unsigned int pc = irq_count() - offset;
if (pc & HARDIRQ_OFFSET) {
vtime_account_hardirq(tsk);
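
The preempt_count() to irq_count() switch matters because the accounting hooks only care about which interrupt context is active; a preempt_disable() section must not be mistaken for one. A deliberately simplified model (the real offsets in <linux/preempt.h> are multi-bit fields, so the single-bit values here are an assumption of the sketch):

    #define SOFTIRQ_BIT  (1UL << 8)
    #define HARDIRQ_BIT  (1UL << 16)
    #define NMI_BIT      (1UL << 20)

    /* irq_count() analogue: keep only the hard/soft/NMI context bits,
     * stripping the preemption-disable count. */
    static unsigned long irq_count_sketch(unsigned long preempt_count)
    {
        return preempt_count & (HARDIRQ_BIT | SOFTIRQ_BIT | NMI_BIT);
    }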
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a8bd7b13634..794c2cb945f8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5126,7 +5126,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 08ae45ad9261..b5add64d9698 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -454,7 +454,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
/*
* For each cpu runqueue, if the task's mm match @mm, ensure that all
- * @mm's membarrier state set bits are also set in in the runqueue's
+ * @mm's membarrier state set bits are also set in the runqueue's
* membarrier state. This ensures that a runqueue scheduling
* between threads which are users of @mm has its membarrier state
* updated.
@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d60fc2c9987..1e63db4dbd9a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -817,7 +817,7 @@ static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
}
/**
- * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
* @sfilter: The seccomp filter
*
* Returns 0 if successful or -errno if error occurred.
diff --git a/kernel/signal.c b/kernel/signal.c
index fcdfba988414..e528f96eebc8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2766,13 +2766,21 @@ relock:
}
/*
+ * PF_IO_WORKER threads will catch and exit on fatal signals
+ * themselves. They have cleanup that must be performed, so
+ * we cannot call do_exit() on their behalf.
+ */
+ if (current->flags & PF_IO_WORKER)
+ goto out;
+
+ /*
* Death signals, no core dump.
*/
do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
-
+out:
ksig->sig = signr;
if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9d71046ea247..5a99696da86a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -13,6 +13,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/local_lock.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
@@ -25,6 +26,9 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/wait_bit.h>
+
+#include <asm/softirq_stack.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -100,20 +104,204 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
#endif
/*
- * preempt_count and SOFTIRQ_OFFSET usage:
- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
- * softirq processing.
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirq_disable_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ * processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
+ *
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/
+#ifdef CONFIG_PREEMPT_RT
-#ifdef CONFIG_TRACE_IRQFLAGS
/*
- * This is for softirq.c-internal use, where hardirqs are disabled
+ * RT accounts for BH disabled sections in task::softirq_disable_cnt and
+ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a
+ * softirq disabled section to be preempted.
+ *
+ * The per task counter is used for softirq_count(), in_softirq() and
+ * in_serving_softirq() because these counts are only valid when the task
+ * holding softirq_ctrl::lock is running.
+ *
+ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that
+ * the task which is in a softirq disabled section is preempted or blocks.
+ */
+struct softirq_ctrl {
+ local_lock_t lock;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
+};
+
+/**
+ * local_bh_blocked() - Check from the idle task whether BH processing is blocked
+ *
+ * Returns false if the per CPU softirq_ctrl::cnt is 0, otherwise true.
+ *
+ * This is invoked from the idle task to guard against false positive
+ * softirq pending warnings, which would happen when the task which holds
+ * softirq_ctrl::lock was the only running task on the CPU and blocks on
+ * some other lock.
+ */
+bool local_bh_blocked(void)
+{
+ return __this_cpu_read(softirq_ctrl.cnt) != 0;
+}
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long flags;
+ int newcnt;
+
+ WARN_ON_ONCE(in_hardirq());
+
+ /* First entry of a task into a BH disabled section? */
+ if (!current->softirq_disable_cnt) {
+ if (preemptible()) {
+ local_lock(&softirq_ctrl.lock);
+ /* Required to meet the RCU bottom half requirements. */
+ rcu_read_lock();
+ } else {
+ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+ }
+ }
+
+ /*
+ * Track the per CPU softirq disabled state. On RT this is per CPU
+ * state to allow preemption of bottom half disabled sections.
+ */
+ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void __local_bh_enable(unsigned int cnt, bool unlock)
+{
+ unsigned long flags;
+ int newcnt;
+
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_on(_RET_IP_);
+ raw_local_irq_restore(flags);
+ }
+
+ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
+
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ bool preempt_on = preemptible();
+ unsigned long flags;
+ u32 pending;
+ int curcnt;
+
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
+
+ local_irq_save(flags);
+ curcnt = __this_cpu_read(softirq_ctrl.cnt);
+
+ /*
+ * If this is not reenabling soft interrupts, no point in trying to
+ * run pending ones.
+ */
+ if (curcnt != cnt)
+ goto out;
+
+ pending = local_softirq_pending();
+ if (!pending || ksoftirqd_running(pending))
+ goto out;
+
+ /*
+ * If this was called from non-preemptible context, wake up the
+ * softirq daemon.
+ */
+ if (!preempt_on) {
+ wakeup_softirqd();
+ goto out;
+ }
+
+ /*
+ * Adjust softirq count to SOFTIRQ_OFFSET which makes
+ * in_serving_softirq() become true.
+ */
+ cnt = SOFTIRQ_OFFSET;
+ __local_bh_enable(cnt, false);
+ __do_softirq();
+
+out:
+ __local_bh_enable(cnt, preempt_on);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+/*
+ * Invoked from ksoftirqd_run() outside of the interrupt disabled section
+ * to acquire the per CPU local lock for reentrancy protection.
+ */
+static inline void ksoftirqd_run_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ local_irq_disable();
+}
+
+/* Counterpart to ksoftirqd_run_begin() */
+static inline void ksoftirqd_run_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET, true);
+ WARN_ON_ONCE(in_interrupt());
+ local_irq_enable();
+}
+
+static inline void softirq_handle_begin(void) { }
+static inline void softirq_handle_end(void) { }
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+static inline void invoke_softirq(void)
+{
+ if (should_wake_ksoftirqd())
+ wakeup_softirqd();
+}
+
+#else /* CONFIG_PREEMPT_RT */
+
+/*
+ * This one is for softirq.c-internal use, where hardirqs are disabled
* legitimately:
*/
+#ifdef CONFIG_TRACE_IRQFLAGS
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -204,6 +392,32 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+static inline void softirq_handle_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+}
+
+static inline void softirq_handle_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+}
+
+static inline void ksoftirqd_run_begin(void)
+{
+ local_irq_disable();
+}
+
+static inline void ksoftirqd_run_end(void)
+{
+ local_irq_enable();
+}
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return true;
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -248,6 +462,8 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
}
+#endif /* !CONFIG_PREEMPT_RT */
+
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
* but break the loop if need_resched() is set or after 2 ms.
@@ -316,7 +532,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending = local_softirq_pending();
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ softirq_handle_begin();
in_hardirq = lockdep_softirq_start();
account_softirq_enter(current);
@@ -352,8 +568,10 @@ restart:
pending >>= softirq_bit;
}
- if (__this_cpu_read(ksoftirqd) == current)
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ __this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
+
local_irq_disable();
pending = local_softirq_pending();
@@ -367,8 +585,7 @@ restart:
account_softirq_exit(current);
lockdep_softirq_end(in_hardirq);
- __local_bh_enable(SOFTIRQ_OFFSET);
- WARN_ON_ONCE(in_interrupt());
+ softirq_handle_end();
current_restore_flags(old_flags, PF_MEMALLOC);
}
@@ -463,7 +680,7 @@ inline void raise_softirq_irqoff(unsigned int nr)
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
- if (!in_interrupt())
+ if (!in_interrupt() && should_wake_ksoftirqd())
wakeup_softirqd();
}
@@ -529,6 +746,20 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
+static bool tasklet_clear_sched(struct tasklet_struct *t)
+{
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) {
+ wake_up_var(&t->state);
+ return true;
+ }
+
+ WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n",
+ t->use_callback ? "callback" : "func",
+ t->use_callback ? (void *)t->callback : (void *)t->func);
+
+ return false;
+}
+
static void tasklet_action_common(struct softirq_action *a,
struct tasklet_head *tl_head,
unsigned int softirq_nr)
@@ -548,13 +779,12 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- if (t->use_callback)
- t->callback(t);
- else
- t->func(t->data);
+ if (tasklet_clear_sched(t)) {
+ if (t->use_callback)
+ t->callback(t);
+ else
+ t->func(t->data);
+ }
tasklet_unlock(t);
continue;
}
@@ -604,21 +834,62 @@ void tasklet_init(struct tasklet_struct *t,
}
EXPORT_SYMBOL(tasklet_init);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * Prevent a live lock if the current task has preempted
+ * soft interrupt processing or prevents ksoftirqd from
+ * running. If the tasklet runs on a different CPU
+ * then this has no effect other than doing the BH
+ * disable/enable dance for nothing.
+ */
+ local_bh_disable();
+ local_bh_enable();
+ } else {
+ cpu_relax();
+ }
+ }
+}
+EXPORT_SYMBOL(tasklet_unlock_spin_wait);
+#endif
+
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
pr_notice("Attempt to kill tasklet from interrupt\n");
- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- do {
- yield();
- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
- }
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state));
+
tasklet_unlock_wait(t);
- clear_bit(TASKLET_STATE_SCHED, &t->state);
+ tasklet_clear_sched(t);
}
EXPORT_SYMBOL(tasklet_kill);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+void tasklet_unlock(struct tasklet_struct *t)
+{
+ smp_mb__before_atomic();
+ clear_bit(TASKLET_STATE_RUN, &t->state);
+ smp_mb__after_atomic();
+ wake_up_var(&t->state);
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock);
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock_wait);
+#endif
+
void __init softirq_init(void)
{
int cpu;
@@ -641,53 +912,21 @@ static int ksoftirqd_should_run(unsigned int cpu)
static void run_ksoftirqd(unsigned int cpu)
{
- local_irq_disable();
+ ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
- local_irq_enable();
+ ksoftirqd_run_end();
cond_resched();
return;
}
- local_irq_enable();
+ ksoftirqd_run_end();
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
- struct tasklet_struct **i;
-
- BUG_ON(cpu_online(cpu));
- BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
- if (!test_bit(TASKLET_STATE_SCHED, &t->state))
- return;
-
- /* CPU is dead, so no lock needed. */
- for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
- if (*i == t) {
- *i = t->next;
- /* If this was the tail element, move the tail ptr */
- if (*i == NULL)
- per_cpu(tasklet_vec, cpu).tail = i;
- return;
- }
- }
- BUG();
-}
-
static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
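
The PREEMPT_RT half of this file keeps two counters in lockstep: a per-CPU count protected by softirq_ctrl::lock and a per-task mirror that lets nested local_bh_disable() calls avoid retaking the lock. A userspace sketch of that bookkeeping (a pthread mutex standing in for the local_lock, a thread-local int for the task field; preemption and RCU details omitted):

    #include <pthread.h>

    static pthread_mutex_t ctrl_lock = PTHREAD_MUTEX_INITIALIZER;
    static int ctrl_cnt;                     /* per CPU in the kernel  */
    static __thread int task_cnt;            /* per task in the kernel */

    static void bh_disable_sketch(int cnt)
    {
        if (!task_cnt)                       /* first entry: take lock */
            pthread_mutex_lock(&ctrl_lock);
        ctrl_cnt += cnt;
        task_cnt = ctrl_cnt;                 /* mirror into the task   */
    }

    static void bh_enable_sketch(int cnt)
    {
        ctrl_cnt -= cnt;
        task_cnt = ctrl_cnt;
        if (!task_cnt)                       /* last exit: drop lock   */
            pthread_mutex_unlock(&ctrl_lock);
    }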
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 6906c6ec4c97..2c5950b0b90e 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)
return (void *)((long)site->addr + (long)&site->addr);
}
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
static inline struct static_call_key *static_call_key(const struct static_call_site *site)
{
- return (struct static_call_key *)
- (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
}
/* These assume the key is word-aligned. */
static inline bool static_call_is_init(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
}
static inline bool static_call_is_tail(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
}
static inline void static_call_set_init(struct static_call_site *site)
{
- site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
(long)&site->key;
}
@@ -146,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -165,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -172,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
- WARN_ONCE(1, "can't patch static call site at %pS",
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
site_addr);
continue;
}
arch_static_call_transform(site_addr, NULL, func,
- static_call_is_tail(site));
+ static_call_is_tail(site));
}
}
@@ -349,7 +355,8 @@ static int static_call_add_module(struct module *mod)
struct static_call_site *site;
for (site = start; site != stop; site++) {
- unsigned long addr = (unsigned long)static_call_key(site);
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
unsigned long key;
/*
@@ -373,8 +380,8 @@ static int static_call_add_module(struct module *mod)
return -EINVAL;
}
- site->key = (key - (long)&site->key) |
- (site->key & STATIC_CALL_SITE_FLAGS);
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
}
return __static_call_init(mod, start, stop);
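
The __static_call_key() helper works because struct static_call_key is word-aligned, leaving the two low bits of its (relative) address free for the INIT and TAIL flags. A sketch of that low-bit tagging in isolation (absolute pointers here; the kernel additionally stores the value relative to &site->key, which this sketch omits):

    #include <stdint.h>

    #define SITE_INIT   0x1UL
    #define SITE_TAIL   0x2UL
    #define SITE_FLAGS  (SITE_INIT | SITE_TAIL)

    struct key_sketch { long data; };   /* word-aligned, low bits free */

    static uintptr_t pack_key(const struct key_sketch *k, uintptr_t flags)
    {
        return (uintptr_t)k | (flags & SITE_FLAGS);
    }

    static struct key_sketch *unpack_key(uintptr_t v)
    {
        return (struct key_sketch *)(v & ~SITE_FLAGS);
    }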
diff --git a/kernel/sys.c b/kernel/sys.c
index 6928d23c46ea..3d62c9599dc0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -119,6 +119,12 @@
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef PAC_SET_ENABLED_KEYS
+# define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL)
+#endif
+#ifndef PAC_GET_ENABLED_KEYS
+# define PAC_GET_ENABLED_KEYS(a) (-EINVAL)
+#endif
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
#endif
@@ -1242,7 +1248,7 @@ static int override_release(char __user *release, size_t len)
break;
rest++;
}
- v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+ v = LINUX_VERSION_PATCHLEVEL + 60;
copy = clamp_t(size_t, len, 1, sizeof(buf));
copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
ret = copy_to_user(release, buf, copy + 1);
@@ -1847,7 +1853,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
- err = inode_permission(inode, MAY_EXEC);
+ err = file_permission(exe.file, MAY_EXEC);
if (err)
goto exit;
@@ -2079,7 +2085,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
- unsigned long user_auxv[AT_VECTOR_SIZE];
+ unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;
@@ -2497,6 +2503,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_PAC_SET_ENABLED_KEYS:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
+ break;
+ case PR_PAC_GET_ENABLED_KEYS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = PAC_GET_ENABLED_KEYS(me);
+ break;
case PR_SET_TAGGED_ADDR_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
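
The PAC_*_ENABLED_KEYS additions follow the file's existing pattern: an architecture opts in by defining the hook before this header region is reached, and everyone else compiles the same prctl() switch against an -EINVAL stub. A minimal sketch of the pattern (hypothetical macro name; NULL standing in for the task pointer):

    #include <errno.h>
    #include <stddef.h>

    #ifndef SET_ENABLED_KEYS_SKETCH
    # define SET_ENABLED_KEYS_SKETCH(task, keys, enabled) (-EINVAL)
    #endif

    static long prctl_pac_sketch(unsigned long arg2, unsigned long arg3,
                                 unsigned long arg4, unsigned long arg5)
    {
        if (arg4 || arg5)                /* unused args must be zero */
            return -EINVAL;
        return SET_ENABLED_KEYS_SKETCH(NULL, arg2, arg3);
    }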
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd848138..62fbd09b5dc1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2962,7 +2962,7 @@ static struct ctl_table vm_table[] = {
.data = &block_dump,
.maxlen = sizeof(block_dump),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -2970,7 +2970,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
@@ -2980,7 +2980,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_legacy_va_layout,
.maxlen = sizeof(sysctl_legacy_va_layout),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#endif
@@ -2990,7 +2990,7 @@ static struct ctl_table vm_table[] = {
.data = &node_reclaim_mode,
.maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
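
The handler swap is a behavior fix, not a cleanup: plain proc_dointvec ignores the .extra1/.extra2 bounds, so the SYSCTL_ZERO lower limit on these entries was dead weight until they switched to proc_dointvec_minmax. In miniature (a userspace stand-in for the minmax handler):

    #include <errno.h>

    static int write_int_minmax(int *val, int new_val,
                                const int *min, const int *max)
    {
        if ((min && new_val < *min) || (max && new_val > *max))
            return -EINVAL;              /* out-of-range write rejected */
        *val = new_val;
        return 0;
    }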
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..bea9d08b1698 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -2,13 +2,13 @@
/*
* Alarmtimer interface
*
- * This interface provides a timer which is similarto hrtimers,
+ * This interface provides a timer which is similar to hrtimers,
* but triggers a RTC alarm if the box is suspend.
*
* This interface is influenced by the Android RTC Alarm timer
* interface.
*
- * Copyright (C) 2010 IBM Corperation
+ * Copyright (C) 2010 IBM Corporation
*
* Author: John Stultz <john.stultz@linaro.org>
*/
@@ -811,7 +811,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
/**
* alarm_timer_nsleep - alarmtimer nanosleep
* @which_clock: clockid
- * @flags: determins abstime or relative
+ * @flags: determines abstime or relative
* @tsreq: requested sleep time (abs or rel)
*
* Handles clock_nanosleep calls against _ALARM clockids
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
+ set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cce484a2cc7c..1d1a61371b5a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -38,7 +38,7 @@
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
* multiplied with the calculated mult factor. Larger ranges may
- * reduce the conversion accuracy by chosing smaller mult and shift
+ * reduce the conversion accuracy by choosing smaller mult and shift
* factors.
*/
void
@@ -518,7 +518,7 @@ static void clocksource_suspend_select(bool fallback)
* the suspend time when resuming system.
*
* This function is called late in the suspend process from timekeeping_suspend(),
- * that means processes are freezed, non-boot cpus and interrupts are disabled
+ * that means processes are frozen, non-boot cpus and interrupts are disabled
* now. It is therefore possible to start the suspend timer without taking the
* clocksource mutex.
*/
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..4a66725b1d4a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
- * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
- * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next
+ * but does not set cpu_base::*expires_next, that is done by
+ * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
+ * cpu_base::*expires_next right away, reprogramming logic would no longer
+ * work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
+static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ ktime_t expires_next, soft = KTIME_MAX;
+
+ /*
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
+ */
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ /*
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
+ */
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
+ }
+
+ return expires_next;
+}
+
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- /*
- * Find the current next expiration time.
- */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
- /*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
- */
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
- }
+ expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -665,7 +683,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* T1 is removed, so this code is called and would reprogram
* the hardware to 5s from now. Any hrtimer_start after that
* will not reprogram the hardware due to hang_detected being
- * set. So we'd effectivly block all timers until the T2 event
+ * set. So we'd effectively block all timers until the T2 event
* fires.
*/
if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
@@ -1001,7 +1019,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
* cpu_base->next_timer. This happens when we remove the first
* timer on a remote cpu. No harm as we never dereference
* cpu_base->next_timer. So the worst thing what can happen is
- * an superflous call to hrtimer_force_reprogram() on the
+ * an superfluous call to hrtimer_force_reprogram() on the
* remote cpu later on if the same timer gets enqueued again.
*/
if (reprogram && timer == cpu_base->next_timer)
@@ -1194,7 +1212,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* The counterpart to hrtimer_cancel_wait_running().
*
* If there is a waiter for cpu_base->expiry_lock, then it was waiting for
- * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
@@ -1380,7 +1398,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
int base;
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context for latency reasons and because the callbacks
* can invoke functions which might sleep on RT, e.g. spin_lock().
@@ -1412,7 +1430,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: The modes which are relevant for intitialization:
+ * @mode: The modes which are relevant for initialization:
* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
* HRTIMER_MODE_REL_SOFT
*
@@ -1469,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* insufficient for that.
*
* The sequence numbers are required because otherwise we could still observe
- * a false negative if the read side got smeared over multiple consequtive
+ * a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
@@ -1570,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
* minimizing wakeups, not running timers at the
* earliest interrupt after their soft expiration.
* This allows us to avoid using a Priority Search
- * Tree, which can answer a stabbing querry for
+ * Tree, which can answer a stabbing query for
* overlapping intervals and instead use the simple
* BST we already have.
* We don't add extra wakeups by delaying timers that
@@ -1644,8 +1662,8 @@ retry:
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
+ /* Reevaluate the clock bases for the [soft] next expiry */
+ expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
@@ -1804,7 +1822,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode)
{
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context either for latency reasons or because the
* hrtimer callback takes regular spinlocks or invokes other
@@ -1817,7 +1835,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* the same CPU. That causes a latency spike due to the wakeup of
* a gazillion threads.
*
- * OTOH, priviledged real-time user space applications rely on the
+ * OTOH, privileged real-time user space applications rely on the
* low latency of hard interrupt wakeups. If the current task is in
* a real-time scheduling class, mark the mode for hard interrupt
* expiry.
@@ -1939,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
}
restart = &current->restart_block;
- restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
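
Both nanosleep paths were converted the same way: populate the restart payload first, then install the function through set_restart_fn(), which bundles the assignment with any per-architecture bookkeeping so the two can never drift apart. A hedged sketch of that helper shape (illustrative types, not the kernel's restart_block definitions):

    struct restart_sketch {
        long (*fn)(struct restart_sketch *);
        int       clockid;
        long long expires;
    };

    static void arch_set_restart_data_sketch(struct restart_sketch *rb)
    {
        (void)rb;                         /* per-arch hook; no-op in this model */
    }

    static void set_restart_fn_sketch(struct restart_sketch *rb,
                                      long (*fn)(struct restart_sketch *))
    {
        rb->fn = fn;                      /* install the function last...    */
        arch_set_restart_data_sketch(rb); /* ...together with the arch state */
    }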
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a5cffe2a1770..a492e4da69ba 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -44,7 +44,7 @@ static u64 jiffies_read(struct clocksource *cs)
* the timer interrupt frequency HZ and it suffers
* inaccuracies caused by missed or lost timer
* interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
+ * interrupt hardware to accurately tick at the
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5247afd7f345..406dccb79c2b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -544,7 +544,7 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec,
struct timespec64 *to_set,
const struct timespec64 *now)
{
- /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */
+ /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */
const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
struct timespec64 delay = {.tv_sec = -1,
.tv_nsec = set_offset_nsec};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..3bb96a8b49c9 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -279,7 +279,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
* @tsk: Task for which cputime needs to be started
* @samples: Storage for time samples
*
- * The thread group cputime accouting is avoided when there are no posix
+ * The thread group cputime accounting is avoided when there are no posix
* CPU timers armed. Before starting a timer it's required to check whether
* the time accounting is active. If not, a full update of the atomic
* accounting store needs to be done and the accounting enabled.
@@ -390,7 +390,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
/*
* If posix timer expiry is handled in task work context then
* timer::it_lock can be taken without disabling interrupts as all
- * other locking happens in task context. This requires a seperate
+ * other locking happens in task context. This requires a separate
* lock class key otherwise regular posix timer expiry would record
* the lock class being taken in interrupt context and generate a
* false positive warning.
@@ -1216,7 +1216,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
check_process_timers(tsk, &firing);
/*
- * The above timer checks have updated the exipry cache and
+ * The above timer checks have updated the expiry cache and
* because nothing can have queued or modified timers after
* sighand lock was taken above it is guaranteed to be
* consistent. So the next timer interrupt fastpath check
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
+ set_restart_fn(restart_block, posix_cpu_nsleep_restart);
}
return error;
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bf540f5a4115..dd5697d7347b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1191,8 +1191,8 @@ SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
err = do_clock_adjtime(which_clock, &ktx);
- if (err >= 0)
- err = put_old_timex32(utp, &ktx);
+ if (err >= 0 && put_old_timex32(utp, &ktx))
+ return -EFAULT;
return err;
}
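
The point of the rewritten error path: do_clock_adjtime() can legitimately return a positive clock state (TIME_OK and friends), and the old code let the 0/-EFAULT result of put_old_timex32() overwrite it. Reduced to its control flow (a compilable model, not the syscall code):

    #include <errno.h>

    static long clock_adjtime_sketch(long adj_result, int copyout_failed)
    {
        if (adj_result >= 0 && copyout_failed)
            return -EFAULT;              /* report the copy-out failure   */
        return adj_result;               /* else preserve the state value */
    }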
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 77c63005dc4e..13b11eb62685 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -21,7 +21,6 @@
#define DEBUGFS_FILENAME "udelay_test"
static DEFINE_MUTEX(udelay_test_lock);
-static struct dentry *udelay_test_debugfs_file;
static int udelay_test_usecs;
static int udelay_test_iterations = DEFAULT_ITERATIONS;
@@ -138,8 +137,8 @@ static const struct file_operations udelay_test_debugfs_ops = {
static int __init udelay_test_init(void)
{
mutex_lock(&udelay_test_lock);
- udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
- S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ debugfs_create_file(DEBUGFS_FILENAME, S_IRUSR, NULL, NULL,
+ &udelay_test_debugfs_ops);
mutex_unlock(&udelay_test_lock);
return 0;
@@ -150,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(udelay_test_debugfs_file);
+ debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index b5a65e212df2..797eb93103ad 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -53,7 +53,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* reasons.
*
* Each caller tries to arm the hrtimer on its own CPU, but if the
- * hrtimer callbback function is currently running, then
+ * hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
*
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5a23829372c7..a44055228796 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -107,6 +107,19 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(dev);
+
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return;
+
+ /*
+ * If the system already runs in oneshot mode, switch the newly
+ * registered broadcast device to oneshot mode explicitly.
+ */
+ if (tick_broadcast_oneshot_active()) {
+ tick_broadcast_switch_to_oneshot();
+ return;
+ }
+
/*
* Inform all cpus about this. We might be in a situation
* where we did not switch to oneshot mode because the per cpu
@@ -115,8 +128,7 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
* notification the systems stays stuck in periodic mode
* forever.
*/
- if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
- tick_clock_notify();
+ tick_clock_notify();
}
/*
@@ -157,7 +169,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
}
/*
- * Check, if the device is disfunctional and a place holder, which
+ * Check, if the device is dysfunctional and a placeholder, which
* needs to be handled by the broadcast device.
*/
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
@@ -391,7 +403,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
* - the broadcast device exists
* - the broadcast device is not a hrtimer based one
* - the broadcast device is in periodic mode to
- * avoid a hickup during switch to oneshot mode
+ * avoid a hiccup during switch to oneshot mode
*/
if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 9d3a22510bab..e15bc0ef1912 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -348,12 +348,7 @@ void tick_check_new_device(struct clock_event_device *newdev)
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
- /* cpu local device ? */
- if (!tick_check_percpu(curdev, newdev, cpu))
- goto out_bc;
-
- /* Preference decision */
- if (!tick_check_preferred(curdev, newdev))
+ if (!tick_check_replacement(curdev, newdev))
goto out_bc;
if (!try_module_get(newdev->owner))
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f9745d47425a..475ecceda768 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force)
}
/**
- * tick_resume_onshot - resume oneshot mode
+ * tick_resume_oneshot - resume oneshot mode
*/
void tick_resume_oneshot(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e10a4af88737..828b091501ca 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,7 +751,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* Aside of that check whether the local timer softirq is
* pending. If so its a bad idea to call get_next_timer_interrupt()
* because there is an already expired timer, so it will request
- * immeditate expiry, which rearms the hardware timer with a
+ * immediate expiry, which rearms the hardware timer with a
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
@@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(local_softirq_pending())) {
static int ratelimit;
- if (ratelimit < 10 &&
+ if (ratelimit < 10 && !local_bh_blocked() &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
(unsigned int) local_softirq_pending());
@@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
*
- * Called from power state control code with interrupts disabled
+ * Called from power state control code with interrupts disabled.
+ *
+ * The return value of this function and/or the value returned by it through the
+ * @delta_next pointer can be negative, which must be taken into account by its
+ * callers.
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 4fb06527cf64..d952ae393423 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -29,7 +29,7 @@ enum tick_nohz_mode {
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
- * it is resetted during irq handling phases.
+ * it is reset during irq handling phases.
* @do_timer_lst: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
* @last_tick: Store the last tick expiry time when the tick
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3985b2b32d08..29923b20e0e4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -571,7 +571,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
- * resolution values don't fall on second boundries. I.e. the line:
+ * resolution values don't fall on second boundaries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
* Note that due to the small error in the multiplier here, this
* rounding is incorrect for sufficiently large values of tv_nsec, but
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 85b98e727306..e6285288d765 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -76,7 +76,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
return ns;
}
-u64 timecounter_cyc2time(struct timecounter *tc,
+u64 timecounter_cyc2time(const struct timecounter *tc,
u64 cycle_tstamp)
{
u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..81fe2a33b80c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,14 +596,14 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
* careful cache layout of the timekeeper because the sequence count and
* struct tk_read_base would then need two cache lines instead of one.
*
- * Access to the time keeper clock source is disabled accross the innermost
+ * Access to the time keeper clock source is disabled across the innermost
* steps of suspend/resume. The accessors still work, but the timestamps
* are frozen until time keeping is resumed which happens very early.
*
* For regular suspend/resume there is no observable difference vs. sched
* clock, but it might affect some of the nasty low level debug printks.
*
- * OTOH, access to sched clock is not guaranteed accross suspend/resume on
+ * OTOH, access to sched clock is not guaranteed across suspend/resume on
* all systems either so it depends on the hardware in use.
*
* If that turns out to be a real problem then this could be mitigated by
@@ -899,7 +899,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
/**
- * ktime_mono_to_any() - convert mononotic time to any other time
+ * ktime_mono_to_any() - convert monotonic time to any other time
* @tmono: time to convert.
* @offs: which offset to use
*/
@@ -1427,35 +1427,45 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
static int change_clocksource(void *data)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *new, *old;
+ struct clocksource *new, *old = NULL;
unsigned long flags;
+ bool change = false;
new = (struct clocksource *) data;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
/*
* If the cs is in module, get a module reference. Succeeds
* for built-in code (owner == NULL) as well.
*/
if (try_module_get(new->owner)) {
- if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr_mono.clock;
- tk_setup_internals(tk, new);
- if (old->disable)
- old->disable(old);
- module_put(old->owner);
- } else {
+ if (!new->enable || new->enable(new) == 0)
+ change = true;
+ else
module_put(new->owner);
- }
}
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ if (change) {
+ old = tk->tkr_mono.clock;
+ tk_setup_internals(tk, new);
+ }
+
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (old) {
+ if (old->disable)
+ old->disable(old);
+
+ module_put(old->owner);
+ }
+
return 0;
}
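
The reshuffle in change_clocksource() follows one rule: work that may sleep or take other locks (module refcounting, the clocksource ->enable()/->disable() callbacks) happens outside the timekeeper seqcount write section, which is left holding only pointer swaps. The shape of the result, as a compilable sketch with stub functions:

    static int  prepare_new_source(void)  { return 1; }  /* may sleep          */
    static void lock_timekeeper(void)     { }            /* seqcount begin     */
    static void install_new_source(void)  { }            /* pointer swaps only */
    static void unlock_timekeeper(void)   { }            /* seqcount end       */
    static void teardown_old_source(void) { }            /* may sleep          */

    static void change_clocksource_sketch(void)
    {
        int change = prepare_new_source();

        lock_timekeeper();
        if (change)
            install_new_source();
        unlock_timekeeper();

        if (change)
            teardown_old_source();
    }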
@@ -1948,7 +1958,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us:
* xtime_nsec_2 = xtime_nsec_1 - offset
- * Which simplfies to:
+ * Which simplifies to:
* xtime_nsec -= offset
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
@@ -2336,7 +2346,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
/*
* Validate if a timespec/timeval used to inject a time
- * offset is valid. Offsets can be postive or negative, so
+ * offset is valid. Offsets can be positive or negative, so
* we don't check tv_sec. The value of the timeval/timespec
* is the sum of its fields,but *NOTE*:
* The field tv_usec/tv_nsec must always be non-negative and
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f475f1a027c8..d111adf4a0cb 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base)
/*
* No need to forward if we are close enough below jiffies.
* Also while executing timers, base->clk is 1 offset ahead
- * of jiffies to avoid endless requeuing to current jffies.
+ * of jiffies to avoid endless requeuing to current jiffies.
*/
if ((long)(jnow - base->clk) < 1)
return;
@@ -1271,7 +1271,7 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* The counterpart to del_timer_wait_running().
*
* If there is a waiter for base->expiry_lock, then it was waiting for the
- * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 88e6b8ed6ca5..f0d5062d9cbc 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -108,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk)
/*
* If the current clocksource is not VDSO capable, then spare the
- * update of the high reolution parts.
+ * update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_data(vdata, tk);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c1a62ae7e812..7fa82778c3e6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -60,6 +60,11 @@ config HAVE_NOP_MCOUNT
help
Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount
+config HAVE_OBJTOOL_MCOUNT
+ bool
+ help
+ Arch supports objtool --mcount
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -545,7 +550,7 @@ config KPROBE_EVENTS_ON_NOTRACE
using kprobe events.
If kprobes can use ftrace instead of breakpoint, ftrace related
- functions are protected from kprobe-events to prevent an infinit
+ functions are protected from kprobe-events to prevent an infinite
recursion or any unexpected execution path which leads to a kernel
crash.
@@ -602,6 +607,30 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
+config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ bool
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_CC
+ def_bool y
+ depends on $(cc-option,-mrecord-mcount)
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_OBJTOOL
+ def_bool y
+ depends on HAVE_OBJTOOL_MCOUNT
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_RECORDMCOUNT
+ def_bool y
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on !FTRACE_MCOUNT_USE_OBJTOOL
+ depends on FTRACE_MCOUNT_RECORD
+
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
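
The four FTRACE_MCOUNT_USE_* symbols added above encode a strict preference order: patchable-function-entry beats -mrecord-mcount, which beats objtool, which beats the recordmcount script; the chained negative "depends on" lines guarantee that exactly one of them is ever y. An illustrative preprocessor rendering of the same precedence (not from the patch):

/* Sketch: the Kconfig cascade resolves to exactly one mcount method. */
#if defined(CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY)
# define MCOUNT_METHOD "compiler patchable-function-entry"
#elif defined(CONFIG_FTRACE_MCOUNT_USE_CC)
# define MCOUNT_METHOD "cc -mrecord-mcount"
#elif defined(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL)
# define MCOUNT_METHOD "objtool --mcount"
#else
# define MCOUNT_METHOD "scripts/recordmcount"
#endif
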
@@ -665,7 +694,7 @@ config TRACEPOINT_BENCHMARK
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
- goes into an infinite loop (calling cond_sched() to let other tasks
+ goes into an infinite loop (calling cond_resched() to let other tasks
run), and calls the tracepoint. Each iteration will record the time
it took to write to the tracepoint and the next iteration that
data will be passed to the tracepoint itself. That is, the tracepoint
@@ -886,6 +915,10 @@ config PREEMPTIRQ_DELAY_TEST
irq-disabled critical sections for 500us:
modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
+	  In addition, if you want to run the test on the CPU that the latency
+	  tracer is running on, specify cpu_affinity=cpu_num at the end of the
+	  command.
+
If unsure, say N
config SYNTH_EVENT_GEN_TEST
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7e44cea89fdc..b28d3e5013cd 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e9ee4945043..c221e4c3f625 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- int pc = 0;
+ unsigned int trace_ctx = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -107,7 +107,7 @@ record_it:
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
+ unsigned int trace_ctx = 0;
pid_t pid;
- int cpu, pc = 0;
+ int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -301,7 +302,7 @@ record_it:
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
return;
}
}
@@ -311,8 +312,6 @@ record_it:
static void blk_trace_free(struct blk_trace *bt)
{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
debugfs_remove(bt->dir);
free_percpu(bt->sequence);
@@ -544,10 +543,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+ debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -1867,7 +1864,17 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs: buffer to be filled
+ * @op: REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ * Maps the REQ_OP_XXX to a character and fills the buffer provided by the
+ * caller with the resulting string.
+ *
+ **/
+void blk_fill_rwbs(char *rwbs, unsigned int op)
{
int i = 0;
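
With the bytes parameter gone, callers pass only the request op/flags word. A usage sketch under the new prototype; the exact output letters are assumed from the conventional op-to-character mapping ('R' read, 'W' write, 'S' sync, ...):

#include <linux/blk_types.h>
#include <linux/blktrace_api.h>

static void rwbs_example(void)
{
	char rwbs[8];					/* RWBS_LEN */

	blk_fill_rwbs(rwbs, REQ_OP_READ);		/* rwbs == "R"  (assumed) */
	blk_fill_rwbs(rwbs, REQ_OP_WRITE | REQ_SYNC);	/* rwbs == "WS" (assumed) */
}
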
diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c
new file mode 100644
index 000000000000..f89792c25b11
--- /dev/null
+++ b/kernel/trace/error_report-traces.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Error reporting trace points.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/error_report.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end);
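
For context, a hypothetical call site for the newly exported tracepoint, assuming the event signature declared in include/trace/events/error_report.h (a detector enum plus an address-sized id):

#include <trace/events/error_report.h>

static void my_error_done(unsigned long addr)
{
	/* emitted once an error report has been printed in full */
	trace_error_report_end(ERROR_DETECTOR_KFENCE, addr);
}
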
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d8e35575549..3ba52d4e1314 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3231,7 +3231,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
pg = start_pg;
while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
kfree(pg);
pg = start_pg;
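
The new "order >= 0" guards exist because get_count_order(0) evaluates to -1 (fls(0) - 1), and free_pages() must not be called with a negative order. A self-contained sketch of the guarded free:

#include <linux/gfp.h>
#include <linux/log2.h>

static void free_record_pages(unsigned long records, unsigned int nr_groups)
{
	int order = get_count_order(nr_groups);	/* -1 when nr_groups == 0 */

	if (order >= 0)				/* skip never-allocated groups */
		free_pages(records, order);
}
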
@@ -5045,6 +5046,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
return NULL;
}
+static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+
+ direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ if (!direct)
+ return NULL;
+ direct->addr = addr;
+ direct->count = 0;
+ list_add_rcu(&direct->next, &ftrace_direct_funcs);
+ ftrace_direct_func_count++;
+ return direct;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5120,15 +5135,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
direct = ftrace_find_direct_func(addr);
if (!direct) {
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ direct = ftrace_alloc_direct_func(addr);
if (!direct) {
kfree(entry);
goto out_unlock;
}
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
}
entry->ip = ip;
@@ -5329,6 +5340,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
int modify_ftrace_direct(unsigned long ip,
unsigned long old_addr, unsigned long new_addr)
{
+ struct ftrace_direct_func *direct, *new_direct = NULL;
struct ftrace_func_entry *entry;
struct dyn_ftrace *rec;
int ret = -ENODEV;
@@ -5344,6 +5356,20 @@ int modify_ftrace_direct(unsigned long ip,
if (entry->direct != old_addr)
goto out_unlock;
+ direct = ftrace_find_direct_func(old_addr);
+ if (WARN_ON(!direct))
+ goto out_unlock;
+ if (direct->count > 1) {
+ ret = -ENOMEM;
+ new_direct = ftrace_alloc_direct_func(new_addr);
+ if (!new_direct)
+ goto out_unlock;
+ direct->count--;
+ new_direct->count++;
+ } else {
+ direct->addr = new_addr;
+ }
+
/*
* If there's no other ftrace callback on the rec->ip location,
* then it can be changed directly by the architecture.
@@ -5357,6 +5383,14 @@ int modify_ftrace_direct(unsigned long ip,
ret = 0;
}
+ if (unlikely(ret && new_direct)) {
+ direct->count++;
+ list_del_rcu(&new_direct->next);
+ synchronize_rcu_tasks();
+ kfree(new_direct);
+ ftrace_direct_func_count--;
+ }
+
out_unlock:
mutex_unlock(&ftrace_lock);
mutex_unlock(&direct_mutex);
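
The count bookkeeping above enforces one rule: a ftrace_direct_func entry may be shared by several ips, so retargeting one ip must not rewrite addr for the others. A user-space model of the split logic (names are mine, not from the patch):

#include <stdlib.h>

struct direct_func {
	unsigned long addr;
	int count;			/* number of ips using this entry */
};

static struct direct_func *retarget(struct direct_func *old,
				    unsigned long new_addr)
{
	struct direct_func *nd;

	if (old->count == 1) {
		old->addr = new_addr;	/* sole user: update in place */
		return old;
	}

	nd = malloc(sizeof(*nd));	/* shared: split off a new entry */
	if (!nd)
		return NULL;
	nd->addr = new_addr;
	nd->count = 1;
	old->count--;
	return nd;
}
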
@@ -6418,7 +6452,8 @@ void ftrace_release_mod(struct module *mod)
clear_mod_from_hashes(pg);
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
tmp_page = pg->next;
kfree(pg);
ftrace_number_of_pages -= 1 << order;
@@ -6778,7 +6813,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
if (!pg->index) {
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
kfree(pg);
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 312d1a0ca3b6..8c4ffd076162 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -21,13 +21,16 @@
static ulong delay = 100;
static char test_mode[12] = "irq";
static uint burst_size = 1;
+static int cpu_affinity = -1;
module_param_named(delay, delay, ulong, 0444);
module_param_string(test_mode, test_mode, 12, 0444);
module_param_named(burst_size, burst_size, uint, 0444);
+module_param_named(cpu_affinity, cpu_affinity, int, 0444);
MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+MODULE_PARM_DESC(cpu_affinity, "CPU number to run the test on");
static struct completion done;
@@ -36,7 +39,9 @@ static struct completion done;
static void busy_wait(ulong time)
{
u64 start, end;
+
start = trace_clock_local();
+
do {
end = trace_clock_local();
if (kthread_should_stop())
@@ -47,6 +52,7 @@ static void busy_wait(ulong time)
static __always_inline void irqoff_test(void)
{
unsigned long flags;
+
local_irq_save(flags);
busy_wait(delay);
local_irq_restore(flags);
@@ -113,6 +119,14 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
+ struct cpumask cpu_mask;
+
+ if (cpu_affinity > -1) {
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(cpu_affinity, &cpu_mask);
+ if (set_cpus_allowed_ptr(current, &cpu_mask))
+ pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
+ }
for (i = 0; i < s; i++)
(testfuncs[i])(i);
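
The affinity handling above follows the standard pattern for pinning the current kthread to one CPU; a minimal sketch of the same pattern in isolation:

#include <linux/cpumask.h>
#include <linux/sched.h>

static int pin_to_cpu(int cpu)
{
	struct cpumask mask;

	cpumask_clear(&mask);
	cpumask_set_cpu(cpu, &mask);
	return set_cpus_allowed_ptr(current, &mask);	/* 0 on success */
}
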
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ec08f948dd80..68744c51517e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1112,8 +1112,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* its flags will be non-zero.
*/
static inline int
-rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *page, struct list_head *list)
+rb_is_head_page(struct buffer_page *page, struct list_head *list)
{
unsigned long val;
@@ -1142,8 +1141,7 @@ static bool rb_is_reader_page(struct buffer_page *page)
/*
* rb_set_list_to_head - set a list_head to be pointing to head.
*/
-static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
+static void rb_set_list_to_head(struct list_head *list)
{
unsigned long *ptr;
@@ -1166,7 +1164,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Set the previous list pointer to have the HEAD flag.
*/
- rb_set_list_to_head(cpu_buffer, head->list.prev);
+ rb_set_list_to_head(head->list.prev);
}
static void rb_list_head_clear(struct list_head *list)
@@ -1241,8 +1239,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
old_flag, RB_PAGE_NORMAL);
}
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
+static inline void rb_inc_page(struct buffer_page **bpage)
{
struct list_head *p = rb_list_head((*bpage)->list.next);
@@ -1274,11 +1271,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
for (i = 0; i < 3; i++) {
do {
- if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
return page;
}
- rb_inc_page(cpu_buffer, &page);
+ rb_inc_page(&page);
} while (page != head);
}
@@ -1824,7 +1821,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cond_resched();
to_remove_page = tmp_iter_page;
- rb_inc_page(cpu_buffer, &tmp_iter_page);
+ rb_inc_page(&tmp_iter_page);
/* update the counters */
page_entries = rb_page_entries(to_remove_page);
@@ -2062,10 +2059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
put_online_cpus();
} else {
- /* Make sure this CPU has been initialized */
- if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
- goto out;
-
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
@@ -2271,7 +2264,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
if (iter->head_page == cpu_buffer->reader_page)
iter->head_page = rb_set_head_page(cpu_buffer);
else
- rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_inc_page(&iter->head_page);
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
@@ -2374,7 +2367,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* want the outer most commit to reset it.
*/
new_head = next_page;
- rb_inc_page(cpu_buffer, &new_head);
+ rb_inc_page(&new_head);
ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
RB_PAGE_NORMAL);
@@ -2526,7 +2519,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
next_page = tail_page;
- rb_inc_page(cpu_buffer, &next_page);
+ rb_inc_page(&next_page);
/*
* If for some reason, we had an interrupt storm that made
@@ -2552,7 +2545,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* the buffer, unless the commit page is still on the
* reader page.
*/
- if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+ if (rb_is_head_page(next_page, &tail_page->list)) {
/*
* If the commit is not on the reader page, then
@@ -2583,7 +2576,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* have filled up the buffer with events
* from interrupts and such, and wrapped.
*
- * Note, if the tail page is also the on the
+ * Note, if the tail page is also on the
* reader_page, we let it move out.
*/
if (unlikely((cpu_buffer->commit_page !=
@@ -2822,6 +2815,17 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
/*
+ * It's possible that the event time delta is zero
+ * (has the same time stamp as the previous event)
+ * in which case write_stamp and before_stamp could
+ * be the same. In such a case, force before_stamp
+	 * to be different from write_stamp. It doesn't
+	 * matter what it is, as long as it's different.
+ */
+ if (!delta)
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ /*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
@@ -2879,7 +2883,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
return;
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ rb_inc_page(&cpu_buffer->commit_page);
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3314,9 +3318,13 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
goto out;
}
atomic_inc(&cpu_buffer->record_disabled);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta, info->after);
+	/* There are some cases during boot up where this can happen */
+ WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
+ pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "");
dump_buffer_page(bpage, info, tail);
atomic_dec(&ts_dump);
/* Do not re-enable checking */
@@ -3638,14 +3646,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* Because the commit page may be on the reader page we
* start with the next page and check the end loop there.
*/
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
start = bpage;
do {
if (bpage->page == (void *)addr) {
local_dec(&bpage->entries);
return;
}
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
} while (bpage != start);
/* commit not part of this buffer?? */
@@ -4367,7 +4375,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->pages = reader->list.prev;
/* The reader page will be pointing to the new head */
- rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+ rb_set_list_to_head(&cpu_buffer->reader_page->list);
/*
* We want to make sure we read the overruns after we set up our
@@ -4406,7 +4414,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_inc_page(&cpu_buffer->head_page);
local_inc(&cpu_buffer->pages_read);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5815a022ecc..915fe8790f04 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps;
int tracing_set_tracer(struct trace_array *tr, const char *buf);
static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -408,7 +408,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
+ TRACE_ITER_HASH_PTR)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -454,6 +455,7 @@ static void __trace_array_put(struct trace_array *this_tr)
/**
* trace_array_put - Decrement the reference counter for this trace array.
+ * @this_tr: pointer to the trace array
*
* NOTE: Use this when we no longer need the trace array returned by
* trace_array_get_by_name(). This ensures the trace array can be later
@@ -530,6 +532,7 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
/**
* trace_ignore_this_task - should a task be ignored for tracing
* @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
* @task: The task that should be ignored if not filtered
*
* Checks if @task should be traced or not from @filtered_pids.
@@ -780,7 +783,7 @@ u64 ftrace_now(int cpu)
}
/**
- * tracing_is_enabled - Show if global_trace has been disabled
+ * tracing_is_enabled - Show if global_trace has been enabled
*
* Shows if the global trace has been enabled or not. It uses the
* mirror flag "buffer_disabled" to be used in fast paths such as for
@@ -905,23 +908,23 @@ static inline void trace_access_lock_init(void)
#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
#else
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+				      unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
@@ -929,24 +932,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
+ int type, unsigned int trace_ctx)
{
struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, type, flags, pc);
+ tracing_generic_entry_update(ent, type, trace_ctx);
}
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len);
if (event != NULL)
- trace_event_setup(event, type, flags, pc);
+ trace_event_setup(event, type, trace_ctx);
return event;
}
@@ -1007,25 +1010,22 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int alloc;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
if (!event) {
size = 0;
goto out;
@@ -1044,7 +1044,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
@@ -1061,25 +1061,22 @@ int __trace_bputs(unsigned long ip, const char *str)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int size = sizeof(struct bputs_entry);
int ret = 0;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ trace_ctx);
if (!event)
goto out;
@@ -1088,7 +1085,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -1932,6 +1929,12 @@ static int run_tracer_selftest(struct tracer *type)
if (!selftests_can_run)
return save_selftest(type);
+ if (!tracing_is_on()) {
+ pr_warn("Selftest for tracer %s skipped due to tracing disabled\n",
+ type->name);
+ return 0;
+ }
+
/*
* Run a selftest on this tracer.
* Here we reset the trace buffer, and set the current
@@ -2584,36 +2587,34 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
- unsigned long flags, int pc)
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
- struct task_struct *tsk = current;
+ unsigned int trace_flags = irqs_status;
+ unsigned int pc;
- entry->preempt_count = pc & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
- entry->type = type;
- entry->flags =
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
-#else
- TRACE_FLAG_IRQS_NOSUPPORT |
-#endif
- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+ pc = preempt_count();
+
+ if (pc & NMI_MASK)
+ trace_flags |= TRACE_FLAG_NMI;
+ if (pc & HARDIRQ_MASK)
+ trace_flags |= TRACE_FLAG_HARDIRQ;
+ if (in_serving_softirq())
+ trace_flags |= TRACE_FLAG_SOFTIRQ;
+
+ if (tif_need_resched())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED;
+ if (test_preempt_need_resched())
+ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ return (trace_flags << 16) | (pc & 0xff);
}
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
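
tracing_gen_ctx_irq_test() replaces the old flags/pc pair with one packed word: the TRACE_FLAG_* bits are shifted into the upper half and the preempt count lives in the low byte. Helpers a consumer could use to unpack it (the helper names are mine, not from the patch):

static inline unsigned int ctx_preempt_count(unsigned int trace_ctx)
{
	return trace_ctx & 0xff;	/* low byte: preempt count */
}

static inline unsigned int ctx_trace_flags(unsigned int trace_ctx)
{
	return trace_ctx >> 16;		/* TRACE_FLAG_* bits */
}
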
@@ -2733,7 +2734,7 @@ struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *entry;
int val;
@@ -2746,15 +2747,15 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
/* Try to use the per cpu buffer first */
val = this_cpu_inc_return(trace_buffered_event_cnt);
if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) {
- trace_event_setup(entry, type, flags, pc);
+ trace_event_setup(entry, type, trace_ctx);
entry->array[0] = len;
return entry;
}
this_cpu_dec(trace_buffered_event_cnt);
}
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
@@ -2763,8 +2764,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
}
return entry;
}
@@ -2850,7 +2851,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc, fbuffer->regs);
+ fbuffer->trace_ctx, fbuffer->regs);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2866,7 +2867,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trace_ctx,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
@@ -2877,8 +2878,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* and mmiotrace, but that's ok if they lose a function or
* two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
- ftrace_trace_userstack(tr, buffer, flags, pc);
+ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+ ftrace_trace_userstack(tr, buffer, trace_ctx);
}
/*
@@ -2892,9 +2893,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
}
void
-trace_function(struct trace_array *tr,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
+trace_function(struct trace_array *tr, unsigned long ip,
+	       unsigned long parent_ip, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2902,7 +2902,7 @@ trace_function(struct trace_array *tr,
struct ftrace_entry *entry;
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -2936,8 +2936,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
@@ -2984,7 +2984,8 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ (sizeof(*entry) - sizeof(entry->caller)) + size,
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3005,22 +3006,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
}
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
return;
}
@@ -3034,7 +3035,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
return;
rcu_irq_enter_irqson();
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
rcu_irq_exit_irqson();
}
@@ -3044,19 +3045,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
*/
void trace_dump_stack(int skip)
{
- unsigned long flags;
-
if (tracing_disabled || tracing_selftest_running)
return;
- local_save_flags(flags);
-
#ifndef CONFIG_UNWINDER_ORC
/* Skip 1 to skip this function. */
skip++;
#endif
__ftrace_trace_stack(global_trace.array_buffer.buffer,
- flags, skip, preempt_count(), NULL);
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3065,7 +3062,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
static void
ftrace_trace_userstack(struct trace_array *tr,
- struct trace_buffer *buffer, unsigned long flags, int pc)
+ struct trace_buffer *buffer, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
@@ -3092,7 +3089,7 @@ ftrace_trace_userstack(struct trace_array *tr,
__this_cpu_inc(user_stack_count);
event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -3112,7 +3109,7 @@ ftrace_trace_userstack(struct trace_array *tr,
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
@@ -3242,9 +3239,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
- int len = 0, size, pc;
+ int len = 0, size;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3252,7 +3249,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
tbuffer = get_trace_buf();
@@ -3266,12 +3263,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
goto out_put;
- local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3281,7 +3277,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3304,9 +3300,9 @@ __trace_array_vprintk(struct trace_buffer *buffer,
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
- int len = 0, size, pc;
+ int len = 0, size;
struct print_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
if (tracing_disabled || tracing_selftest_running)
@@ -3315,7 +3311,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
@@ -3327,11 +3323,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- local_save_flags(flags);
size = sizeof(*entry) + len + 1;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3340,7 +3335,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3543,6 +3538,69 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
return next;
}
+#define STATIC_FMT_BUF_SIZE 128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+static char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+ char *tmp;
+
+ /*
+ * iter->tr is NULL when used with tp_printk, which makes
+ * this get called where it is not safe to call krealloc().
+ */
+ if (!iter->tr || iter->fmt == static_fmt_buf)
+ return NULL;
+
+ tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+ GFP_KERNEL);
+ if (tmp) {
+ iter->fmt_size += STATIC_FMT_BUF_SIZE;
+ iter->fmt = tmp;
+ }
+
+ return tmp;
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+ const char *p, *new_fmt;
+ char *q;
+
+ if (WARN_ON_ONCE(!fmt))
+ return fmt;
+
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ return fmt;
+
+ p = fmt;
+ new_fmt = q = iter->fmt;
+ while (*p) {
+ if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+ if (!trace_iter_expand_format(iter))
+ return fmt;
+
+ q += iter->fmt - new_fmt;
+ new_fmt = iter->fmt;
+ }
+
+ *q++ = *p++;
+
+ /* Replace %p with %px */
+ if (p[-1] == '%') {
+ if (p[0] == '%') {
+ *q++ = *p++;
+ } else if (p[0] == 'p' && !isalnum(p[1])) {
+ *q++ = *p++;
+ *q++ = 'x';
+ }
+ }
+ }
+ *q = '\0';
+
+ return new_fmt;
+}
+
#define STATIC_TEMP_BUF_SIZE 128
static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
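
The rewrite loop in trace_event_format() above only touches a bare %p: %% is skipped as a literal, and specifiers such as %ps are left alone because the following character is alphanumeric. A user-space demonstration of the same transformation, ignoring the buffer-growing path:

#include <ctype.h>
#include <stdio.h>

/* Minimal model of the %p -> %px rewrite; out must be large enough. */
static void expand_p(const char *fmt, char *out)
{
	const char *p = fmt;
	char *q = out;

	while (*p) {
		*q++ = *p++;
		if (p[-1] == '%') {
			if (p[0] == '%') {			/* literal %% */
				*q++ = *p++;
			} else if (p[0] == 'p' && !isalnum(p[1])) {
				*q++ = *p++;			/* copy 'p' */
				*q++ = 'x';			/* append 'x' */
			}
		}
	}
	*q = '\0';
}

int main(void)
{
	char buf[64];

	expand_p("ptr=%p pct=%% sym=%ps", buf);
	printf("%s\n", buf);	/* prints: ptr=%px pct=%% sym=%ps */
	return 0;
}
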
@@ -4336,6 +4394,16 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->temp_size = 128;
/*
+	 * trace_event_printf() may need to modify the given format
+	 * string to replace %p with %px so that it shows the real address
+	 * instead of a hash value. However, that is only needed for event
+	 * tracing; other tracers may not need it. Defer the allocation
+	 * until it is needed.
+ */
+ iter->fmt = NULL;
+ iter->fmt_size = 0;
+
+ /*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
*/
@@ -4486,6 +4554,7 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
+ kfree(iter->fmt);
kfree(iter->temp);
kfree(iter->trace);
kfree(iter->buffer_iter);
@@ -4763,7 +4832,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -6653,7 +6722,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6673,7 +6741,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
/* If less than "<faulted>", then make sure we can still add that */
@@ -6682,7 +6749,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -6734,7 +6801,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct raw_data_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6756,14 +6822,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (!event)
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -9348,9 +9413,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* Simulate the iterator */
trace_init_global_iter(&iter);
- /* Can not use kmalloc for iter.temp */
+	/* Cannot use kmalloc for iter.temp and iter.fmt */
iter.temp = static_temp_buf;
iter.temp_size = STATIC_TEMP_BUF_SIZE;
+ iter.fmt = static_fmt_buf;
+ iter.fmt_size = STATIC_FMT_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -9429,30 +9496,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
EXPORT_SYMBOL_GPL(ftrace_dump);
-int trace_run_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
#define WRITE_BUFSIZE 4096
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
+ int (*createfn)(const char *))
{
char *kbuf, *buf, *tmp;
int ret = 0;
@@ -9500,7 +9548,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (tmp)
*tmp = '\0';
- ret = trace_run_command(buf, createfn);
+ ret = createfn(buf);
if (ret)
goto out;
buf += size;
@@ -9648,7 +9696,7 @@ void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
- kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e448d2da0b99..a6446c03cfbc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCHED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
- TRACE_FLAG_NMI = 0x40,
-};
-
#define TRACE_BUF_SIZE 1024
struct trace_array;
@@ -589,8 +570,7 @@ struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags,
- int pc);
+ unsigned int trace_ctx);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -601,6 +581,8 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+
int trace_empty(struct trace_iterator *iter);
void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -615,15 +597,14 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -687,11 +668,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { }
#endif
#ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
#else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
- int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
}
#endif /* CONFIG_STACKTRACE */
@@ -831,10 +811,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1194,6 +1174,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
+ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1297,15 +1278,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+				     unsigned int trace_ctx,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1366,8 +1347,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1377,12 +1357,12 @@ static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc)
+ void *entry, unsigned int trace_ctx)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
if (tt)
event_triggers_post_call(file, tt);
@@ -1394,8 +1374,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1408,14 +1387,14 @@ static inline void
event_trigger_unlock_commit_regs(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc,
+ void *entry, unsigned int trace_ctx,
struct pt_regs *regs)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- irq_flags, pc, regs);
+ trace_ctx, regs);
if (tt)
event_triggers_post_call(file, tt);
@@ -1830,10 +1809,9 @@ extern int tracing_set_cpumask(struct trace_array *tr,
#define MAX_EVENT_NAME_LEN 64
-extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
extern ssize_t trace_parse_run_command(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
+ int (*createfn)(const char *));
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
const char *p;
if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
if (atomic_read(&data->disabled))
goto out;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(flags);
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4f967d5cd917..e57cc0870892 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -31,23 +31,31 @@ int dyn_event_register(struct dyn_event_operations *ops)
return 0;
}
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type)
{
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
- int ret = -ENOENT;
+ int argc, ret = -ENOENT;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':')
- return -EINVAL;
+ if (argv[0][1] != ':') {
+ ret = -EINVAL;
+ goto out;
+ }
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event)
- return -EINVAL;
+ if (!event) {
+ ret = -EINVAL;
+ goto out;
+ }
event++;
}
- argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -55,15 +63,17 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0')
- return -EINVAL;
+ if (event[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
if (type && type != pos->ops)
continue;
if (!pos->ops->match(system, event,
- argc, (const char **)argv, pos))
+ argc - 1, (const char **)argv + 1, pos))
continue;
ret = pos->ops->free(pos);
@@ -71,21 +81,22 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
break;
}
mutex_unlock(&event_mutex);
-
+out:
+ argv_free(argv);
return ret;
}
-static int create_dyn_event(int argc, char **argv)
+static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
int ret = -ENODEV;
- if (argv[0][0] == '-' || argv[0][0] == '!')
- return dyn_event_release(argc, argv, NULL);
+ if (raw_command[0] == '-' || raw_command[0] == '!')
+ return dyn_event_release(raw_command, NULL);
mutex_lock(&dyn_event_ops_mutex);
list_for_each_entry(ops, &dyn_event_ops_list, list) {
- ret = ops->create(argc, (const char **)argv);
+ ret = ops->create(raw_command);
if (!ret || ret != -ECANCELED)
break;
}
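
Under the new prototype each dyn_event backend receives the raw command line and splits it itself. A sketch of a conforming create() callback ("foo" and foo_parse() are hypothetical):

#include <linux/slab.h>
#include <linux/string.h>

static int foo_parse(int argc, char **argv);	/* hypothetical parser */

static int foo_create(const char *raw_command)
{
	char **argv;
	int argc, ret;

	argv = argv_split(GFP_KERNEL, raw_command, &argc);
	if (!argv)
		return -ENOMEM;

	ret = argc ? foo_parse(argc, argv) : -EINVAL;

	argv_free(argv);
	return ret;
}
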
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6f72dcb7269..7754936b57ee 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -39,7 +39,7 @@ struct dyn_event;
*/
struct dyn_event_operations {
struct list_head list;
- int (*create)(int argc, const char *argv[]);
+ int (*create)(const char *raw_command);
int (*show)(struct seq_file *m, struct dyn_event *ev);
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
@@ -97,7 +97,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a71181655958..288ad2c274fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
void perf_trace_buf_update(void *record, u16 type)
{
struct trace_entry *entry = record;
- int pc = preempt_count();
- unsigned long flags;
- local_save_flags(flags);
- tracing_generic_entry_update(entry, type, flags, pc);
+ tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d387b774ceeb..a3563afd412d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
trace_event_ignore_this_pid(trace_file))
return NULL;
- local_save_flags(fbuffer->flags);
- fbuffer->pc = preempt_count();
/*
* If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- fbuffer->pc--;
+ fbuffer->trace_ctx = tracing_gen_ctx_dec();
fbuffer->trace_file = trace_file;
fbuffer->event =
trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->trace_ctx);
if (!fbuffer->event)
return NULL;
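
tracing_gen_ctx_dec() is assumed to produce the same packed context word as tracing_gen_ctx(), with the preempt count in the low byte decremented so the tracepoint's own preempt_disable() does not show up in the event. Roughly:

#include <linux/trace_events.h>

static inline unsigned int my_gen_ctx_dec(void)
{
	unsigned int trace_ctx = tracing_gen_ctx();

	/* hide the tracepoint's own preempt_disable() (low byte = count) */
	if (IS_ENABLED(CONFIG_PREEMPTION))
		trace_ctx--;
	return trace_ctx;
}
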
@@ -2101,16 +2098,20 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = tracefs_create_file("filter", 0644, dir->entry, dir,
- &ftrace_subsystem_filter_fops);
- if (!entry) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
+ /* the ftrace system is special, do not create enable or filter files */
+ if (strcmp(name, "ftrace") != 0) {
- trace_create_file("enable", 0644, dir->entry, dir,
- &ftrace_system_enable_fops);
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
+ }
+
+ trace_create_file("enable", 0644, dir->entry, dir,
+ &ftrace_system_enable_fops);
+ }
list_add(&dir->list, &tr->systems);
@@ -3679,12 +3680,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct trace_buffer *buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3692,11 +3692,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
if (disabled != 1)
goto out;
- local_save_flags(flags);
-
event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3704,7 +3702,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->parent_ip = parent_ip;
event_trigger_unlock_commit(&event_trace_file, buffer, event,
- entry, flags, pc);
+ entry, trace_ctx);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 22bcf7c51d1e..c188045c5f97 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
{
struct ftrace_event_field *field;
- unsigned long irq_flags;
void *entry = NULL;
int entry_size;
u64 val = 0;
@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
if (!entry)
return -ENOMEM;
- local_save_flags(irq_flags);
- tracing_generic_entry_update(entry, call->event.type, irq_flags,
- preempt_count());
+ tracing_generic_entry_update(entry, call->event.type,
+ tracing_gen_ctx());
while ((len = parse_field(str, call, &field, &val)) > 0) {
if (is_function_field(field))
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 5a8bc0b421f1..8d71e6c83f10 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -23,13 +23,14 @@
#undef ERRORS
#define ERRORS \
C(BAD_NAME, "Illegal name"), \
- C(CMD_INCOMPLETE, "Incomplete command"), \
+ C(INVALID_CMD, "Command must be of the form: <name> field[;field] ..."),\
+ C(INVALID_DYN_CMD, "Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\
C(EVENT_EXISTS, "Event already exists"), \
C(TOO_MANY_FIELDS, "Too many fields"), \
C(INCOMPLETE_TYPE, "Incomplete type"), \
C(INVALID_TYPE, "Invalid type"), \
- C(INVALID_FIELD, "Invalid field"), \
- C(CMD_TOO_LONG, "Command too long"),
+ C(INVALID_FIELD, "Invalid field"), \
+ C(INVALID_ARRAY_SPEC, "Invalid array specification"),
#undef C
#define C(a, b) SYNTH_ERR_##a
@@ -48,7 +49,7 @@ static int errpos(const char *str)
return err_pos(last_cmd, str);
}
-static void last_cmd_set(char *str)
+static void last_cmd_set(const char *str)
{
if (!str)
return;
@@ -62,7 +63,7 @@ static void synth_err(u8 err_type, u8 err_pos)
err_type, err_pos);
}
-static int create_synth_event(int argc, const char **argv);
+static int create_synth_event(const char *raw_command);
static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
@@ -579,18 +580,32 @@ static void free_synth_field(struct synth_field *field)
kfree(field);
}
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
+static int check_field_version(const char *prefix, const char *field_type,
+ const char *field_name)
+{
+ /*
+ * For backward compatibility, the old synthetic event command
+ * format did not require semicolons, and in order to not
+ * break user space, that old format must still work. If a new
+ * feature is added, then the format that uses the new feature
+ * will be required to have semicolons, as nothing that uses
+ * the old format would be using the new, yet to be created,
+	 * feature. When a new feature is added, this function will
+	 * detect it and return a number greater than 1, requiring the
+	 * format to use semicolons.
+ */
+ return 1;
+}
+
+static struct synth_field *parse_synth_field(int argc, char **argv,
+ int *consumed, int *field_version)
{
- struct synth_field *field;
const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
+ struct synth_field *field;
int len, ret = -ENOMEM;
struct seq_buf s;
ssize_t size;
- if (field_type[0] == ';')
- field_type++;
-
if (!strcmp(field_type, "unsigned")) {
if (argc < 3) {
synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type));
@@ -599,12 +614,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
prefix = "unsigned ";
field_type = argv[1];
field_name = argv[2];
- *consumed = 3;
+ *consumed += 3;
} else {
field_name = argv[1];
- *consumed = 2;
+ *consumed += 2;
}
+ if (!field_name) {
+ synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type));
+ return ERR_PTR(-EINVAL);
+ }
+
+ *field_version = check_field_version(prefix, field_type, field_name);
+
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
return ERR_PTR(-ENOMEM);
@@ -613,8 +635,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
array = strchr(field_name, '[');
if (array)
len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
if (!field->name)
@@ -626,8 +646,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
goto free;
}
- if (field_type[0] == ';')
- field_type++;
len = strlen(field_type) + 1;
if (array)
@@ -644,11 +662,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
if (prefix)
seq_buf_puts(&s, prefix);
seq_buf_puts(&s, field_type);
- if (array) {
+ if (array)
seq_buf_puts(&s, array);
- if (s.buffer[s.len - 1] == ';')
- s.len--;
- }
if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
goto free;
@@ -656,7 +671,10 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
size = synth_field_size(field->type);
if (size < 0) {
- synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
+ if (array)
+ synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name));
+ else
+ synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type));
ret = -EINVAL;
goto free;
} else if (size == 0) {
@@ -1160,46 +1178,13 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
}
EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-static int save_cmdstr(int argc, const char *name, const char **argv)
-{
- struct seq_buf s;
- char *buf;
- int i;
-
- buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN);
-
- seq_buf_puts(&s, name);
-
- for (i = 0; i < argc; i++) {
- seq_buf_putc(&s, ' ');
- seq_buf_puts(&s, argv[i]);
- }
-
- if (!seq_buf_buffer_left(&s)) {
- synth_err(SYNTH_ERR_CMD_TOO_LONG, 0);
- kfree(buf);
- return -EINVAL;
- }
- buf[s.len] = 0;
- last_cmd_set(buf);
-
- kfree(buf);
- return 0;
-}
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
+static int __create_synth_event(const char *name, const char *raw_fields)
{
+ char **argv, *field_str, *tmp_fields, *saved_fields = NULL;
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ int consumed, cmd_version = 1, n_fields_this_loop;
+ int i, argc, n_fields = 0, ret = 0;
struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
-
- ret = save_cmdstr(argc, name, argv);
- if (ret)
- return ret;
/*
* Argument syntax:
@@ -1208,46 +1193,101 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
* where 'field' = type field_name
*/
- if (name[0] == '\0' || argc < 1) {
- synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0);
+ if (name[0] == '\0') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
return -EINVAL;
}
- mutex_lock(&event_mutex);
-
if (!is_good_name(name)) {
synth_err(SYNTH_ERR_BAD_NAME, errpos(name));
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
+ mutex_lock(&event_mutex);
+
event = find_synth_event(name);
if (event) {
synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name));
ret = -EEXIST;
- goto out;
+ goto err;
}
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
- ret = -EINVAL;
+ tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL);
+ if (!tmp_fields) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ while ((field_str = strsep(&tmp_fields, ";")) != NULL) {
+ argv = argv_split(GFP_KERNEL, field_str, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
goto err;
}
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
+ if (!argc) {
+ argv_free(argv);
+ continue;
+ }
+
+ n_fields_this_loop = 0;
+ consumed = 0;
+ while (argc > consumed) {
+ int field_version;
+
+ field = parse_synth_field(argc - consumed,
+ argv + consumed, &consumed,
+ &field_version);
+ if (IS_ERR(field)) {
+ argv_free(argv);
+ ret = PTR_ERR(field);
+ goto err;
+ }
+
+ /*
+ * Track the highest version of any field we
+ * found in the command.
+ */
+ if (field_version > cmd_version)
+ cmd_version = field_version;
+
+ /*
+ * Now sort out what is and isn't valid for
+ * each supported version.
+ *
+ * If we see more than 1 field per loop, it
+ * means we have multiple fields between
+ * semicolons, and that's something we no
+ * longer support in a version 2 or greater
+ * command.
+ */
+ if (cmd_version > 1 && n_fields_this_loop >= 1) {
+ synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fields[n_fields++] = field;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ n_fields_this_loop++;
+ }
+
+ if (consumed < argc) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
goto err;
}
- fields[n_fields++] = field;
- i += consumed - 1;
+
+ argv_free(argv);
}
- if (i < argc && strcmp(argv[i], ";") != 0) {
- synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i]));
+ if (n_fields == 0) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
ret = -EINVAL;
goto err;
}
@@ -1266,6 +1306,8 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
out:
mutex_unlock(&event_mutex);
+ kfree(saved_fields);
+
return ret;
err:
for (i = 0; i < n_fields; i++)
@@ -1383,19 +1425,79 @@ int synth_event_delete(const char *event_name)
}
EXPORT_SYMBOL_GPL(synth_event_delete);
-static int create_or_delete_synth_event(int argc, char **argv)
+static int check_command(const char *raw_command)
{
- const char *name = argv[0];
- int ret;
+ char **argv = NULL, *cmd, *saved_cmd, *name_and_field;
+ int argc, ret = 0;
+
+ cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ name_and_field = strsep(&cmd, ";");
+ if (!name_and_field) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (name_and_field[0] == '!')
+ goto free;
+
+ argv = argv_split(GFP_KERNEL, name_and_field, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ argv_free(argv);
+
+ if (argc < 3)
+ ret = -EINVAL;
+free:
+ kfree(saved_cmd);
+
+ return ret;
+}
+
+static int create_or_delete_synth_event(const char *raw_command)
+{
+ char *name = NULL, *fields, *p;
+ int ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+
+ ret = check_command(raw_command);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ p = strpbrk(raw_command, " \t");
+ if (!p && raw_command[0] != '!') {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
- /* trace_run_command() ensures argc != 0 */
if (name[0] == '!') {
ret = synth_event_delete(name + 1);
- return ret;
+ goto free;
}
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
+ fields = skip_spaces(p);
+
+ ret = __create_synth_event(name, fields);
+free:
+ kfree(name);
+
+ return ret;
}
static int synth_event_run_command(struct dynevent_cmd *cmd)
@@ -1403,7 +1505,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd)
struct synth_event *se;
int ret;
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ ret = create_or_delete_synth_event(cmd->seq.buffer);
if (ret)
return ret;
@@ -1939,10 +2041,27 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state)
}
EXPORT_SYMBOL_GPL(synth_event_trace_end);
-static int create_synth_event(int argc, const char **argv)
+static int create_synth_event(const char *raw_command)
{
- const char *name = argv[0];
- int len;
+ char *fields, *p;
+ const char *name;
+ int len, ret = 0;
+
+ raw_command = skip_spaces(raw_command);
+ if (raw_command[0] == '\0')
+ return ret;
+
+ last_cmd_set(raw_command);
+
+ p = strpbrk(raw_command, " \t");
+ if (!p) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return -EINVAL;
+ }
+
+ fields = skip_spaces(p);
+
+ name = raw_command;
if (name[0] != 's' || name[1] != ':')
return -ECANCELED;
@@ -1951,11 +2070,30 @@ static int create_synth_event(int argc, const char **argv)
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
+ if (len == 0) {
+ synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0);
return -EINVAL;
+ }
name += len;
}
- return __create_synth_event(argc - 1, name, argv + 1);
+
+ len = name - raw_command;
+
+ ret = check_command(raw_command + len);
+ if (ret) {
+ synth_err(SYNTH_ERR_INVALID_CMD, 0);
+ return ret;
+ }
+
+ name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ ret = __create_synth_event(name, fields);
+
+ kfree(name);
+
+ return ret;
}
static int synth_event_release(struct dyn_event *ev)
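The reworked parser accepts both the legacy space-separated field list and the stricter semicolon-separated grammar that future field features will require. Illustrative command strings (event and field names are made up), shown in C comment form:

	/*
	 * As written into tracefs' synthetic_events file:
	 *
	 *   myevent u64 lat pid_t pid     - legacy v1 form, no semicolons
	 *   myevent u64 lat; pid_t pid    - v2 form, one field per semicolon
	 *   !myevent                      - delete the event
	 *
	 * Once cmd_version > 1, multiple fields between semicolons are
	 * rejected by the n_fields_this_loop check above.
	 */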
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c5095dd28e20..f93723ca66bc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -106,8 +106,7 @@ static int function_trace_init(struct trace_array *tr)
ftrace_init_array_ops(tr, func);
- tr->array_buffer.cpu = get_cpu();
- put_cpu();
+ tr->array_buffer.cpu = raw_smp_processor_id();
tracing_start_cmdline_record();
tracing_start_function_trace(tr);
@@ -132,10 +131,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
int bit;
int cpu;
- int pc;
if (unlikely(!tr->function_enabled))
return;
@@ -144,15 +142,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (!atomic_read(&data->disabled)) {
- local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
- }
+ if (!atomic_read(&data->disabled))
+ trace_function(tr, ip, parent_ip, trace_ctx);
+
ftrace_test_recursion_unlock(bit);
preempt_enable_notrace();
}
@@ -184,7 +181,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
if (unlikely(!tr->function_enabled))
return;
@@ -199,9 +196,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- trace_function(tr, ip, parent_ip, flags, pc);
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_function(tr, ip, parent_ip, trace_ctx);
+ __trace_stack(tr, trace_ctx, STACK_SKIP);
}
atomic_dec(&data->disabled);
@@ -404,13 +401,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- local_save_flags(flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
+ __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d874dec87131..0aa6e6faa943 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int ret;
int cpu;
- int pc;
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
return 0;
@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
} else {
ret = 0;
}
@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
static void
__trace_graph_function(struct trace_array *tr,
- unsigned long ip, unsigned long flags, int pc)
+ unsigned long ip, unsigned int trace_ctx)
{
u64 time = trace_clock_local();
struct ftrace_graph_ent ent = {
@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr,
.rettime = time,
};
- __trace_graph_entry(tr, &ent, flags, pc);
- __trace_graph_return(tr, &ret, flags, pc);
+ __trace_graph_entry(tr, &ent, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx);
}
void
trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- __trace_graph_function(tr, ip, flags, pc);
+ __trace_graph_function(tr, ip, trace_ctx);
}
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
ftrace_graph_addr_finish(trace);
@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index c0df9b97f147..34dc1a712dcb 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
- unsigned long flags;
- int pc;
-
- pc = preempt_count();
- local_save_flags(flags);
event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
- flags, pc);
+ tracing_gen_ctx());
if (!event)
return;
entry = ring_buffer_event_data(event);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6756379b661f..590b3d51afae 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
if (!func_prolog_dec(tr, &data, &flags))
return;
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
+ unsigned int trace_ctx;
int ret;
- int pc;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_dec(tr, &data, &flags))
return 0;
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
return ret;
@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
}
@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
#else
@@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr,
{
u64 T0, T1, delta;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
- local_save_flags(flags);
-
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
if (!report_latency(tr, delta))
goto out;
@@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(tr, delta))
goto out_unlock;
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
/* Skip 5 functions to get to the irq/preempt enable function */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, trace_ctx, 5);
if (data->critical_sequence != max_sequence)
goto out_unlock;
@@ -363,16 +364,15 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
}
static nokprobe_inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- local_save_flags(flags);
-
- __trace_function(tr, ip, parent_ip, flags, pc);
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
per_cpu(tracing_cpu, cpu) = 1;
@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
}
static nokprobe_inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned int trace_ctx;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_inc(&data->disabled);
- local_save_flags(flags);
- __trace_function(tr, ip, parent_ip, flags, pc);
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
/* start and stop critical timings used for stoppage (in idle) */
void start_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) || irq_trace())
- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+ if (preempt_trace(preempt_count()) || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
NOKPROBE_SYMBOL(stop_critical_timings);
@@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
*/
void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ stop_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
- unsigned int pc = preempt_count();
-
- if (!preempt_trace(pc) && irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (!preempt_trace(preempt_count()) && irq_trace())
+ start_critical_timing(a0, a1);
}
NOKPROBE_SYMBOL(tracer_hardirqs_off);
@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly =
#ifdef CONFIG_PREEMPT_TRACER
void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- stop_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ stop_critical_timing(a0, a1);
}
void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
- int pc = preempt_count();
-
- if (preempt_trace(pc) && !irq_trace())
- start_critical_timing(a0, a1, pc);
+ if (preempt_trace(preempt_count()) && !irq_trace())
+ start_critical_timing(a0, a1);
}
static int preemptoff_tracer_init(struct trace_array *tr)
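Three helper variants appear across these hunks; a summary of the division of labor, inferred from how each call site uses them:

	/*
	 * tracing_gen_ctx()        - sample irq flags and preempt_count() now
	 * tracing_gen_ctx_flags(f) - caller already saved irq flags in f
	 * tracing_gen_ctx_dec()    - like tracing_gen_ctx(), minus the preempt
	 *                            level the tracepoint itself added
	 */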
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 56c7fbff7bd7..6fe770d86dc3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ static int __init set_kprobe_boot_events(char *str)
}
__setup("kprobe_event=", set_kprobe_boot_events);
-static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_create(const char *raw_command);
static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_kprobe_release(struct dyn_event *ev);
static bool trace_kprobe_is_busy(struct dyn_event *ev);
@@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- mutex_lock(&module_mutex);
+ rcu_read_lock_sched();
ret = !!find_module(tk->symbol);
- mutex_unlock(&module_mutex);
+ rcu_read_unlock_sched();
*p = ':';
return ret;
@@ -711,7 +711,7 @@ static inline void sanitize_event_name(char *name)
*name = '_';
}
-static int trace_kprobe_create(int argc, const char *argv[])
+static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
@@ -910,20 +910,25 @@ error:
goto out;
}
-static int create_or_delete_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_kprobe_create);
+}
+
+static int create_or_delete_trace_kprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_kprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(argc, (const char **)argv);
+ ret = trace_kprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
static int trace_kprobe_run_command(struct dynevent_cmd *cmd)
{
- return trace_run_command(cmd->seq.buffer, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(cmd->seq.buffer);
}
/**
@@ -1084,7 +1089,7 @@ int kprobe_event_delete(const char *name)
snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name);
- return trace_run_command(buf, create_or_delete_trace_kprobe);
+ return create_or_delete_trace_kprobe(buf);
}
EXPORT_SYMBOL_GPL(kprobe_event_delete);
@@ -1386,8 +1391,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
+ fbuffer.trace_ctx = tracing_gen_ctx();
fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
@@ -1396,7 +1400,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
call->event.type,
sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
+ fbuffer.trace_ctx);
if (!fbuffer.event)
return;
@@ -1434,8 +1438,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- local_save_flags(fbuffer.flags);
- fbuffer.pc = preempt_count();
+ fbuffer.trace_ctx = tracing_gen_ctx();
fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
@@ -1443,7 +1446,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
call->event.type,
sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.flags, fbuffer.pc);
+ fbuffer.trace_ctx);
if (!fbuffer.event)
return;
@@ -1888,7 +1891,7 @@ static __init void setup_boot_kprobe_events(void)
if (p)
*p++ = '\0';
- ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe(cmd);
if (ret)
pr_warn("Failed to add event(%d): %s\n", ret, cmd);
@@ -1982,8 +1985,7 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -2004,8 +2006,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
- create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -2078,13 +2079,13 @@ static __init int kprobe_trace_self_tests_init(void)
trace_probe_event_call(&tk->tp), file);
}
- ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
+ ret = create_or_delete_trace_kprobe("-:testprobe2");
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
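The module_mutex hunk above follows a tree-wide change to module lookup locking; a sketch of the calling convention it assumes (helper name is hypothetical):

	/* Assumed convention: find_module() now nests under RCU-sched
	 * instead of taking module_mutex (sketch, not patch code). */
	static bool symbol_module_loaded(const char *name)
	{
		bool loaded;

		rcu_read_lock_sched();
		loaded = !!find_module(name);
		rcu_read_unlock_sched();

		return loaded;
	}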
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 84582bf1ed5f..64e77b513697 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -5,8 +5,6 @@
* Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
*/
-#define DEBUG 1
-
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
@@ -300,10 +298,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -312,7 +311,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -330,10 +329,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
- int pc = preempt_count();
+ unsigned int trace_ctx;
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
- sizeof(*entry), 0, pc);
+ sizeof(*entry), trace_ctx);
if (!event) {
atomic_inc(&dropped_count);
return;
@@ -342,7 +342,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 92b1575ae0ca..61255bad7e01 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -312,13 +312,23 @@ int trace_raw_output_prep(struct trace_iterator *iter,
}
EXPORT_SYMBOL(trace_raw_output_prep);
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+ va_end(ap);
+}
+EXPORT_SYMBOL(trace_event_printf);
+
static int trace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
trace_seq_printf(s, "%s: ", name);
- trace_seq_vprintf(s, fmt, ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
return trace_handle_return(s);
}
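trace_event_printf() funnels every format string through trace_event_format() before it reaches the seq buffer, and trace_output_raw() now does the same, so any rewriting applies on both paths. A hedged sketch of a print handler using the new helper (handler and message are hypothetical):

	static enum print_line_t my_event_print(struct trace_iterator *iter,
						int flags, struct trace_event *event)
	{
		/* Go through the wrapper so trace_event_format() can
		 * rewrite fmt before printing. */
		trace_event_printf(iter, "my_event: count=%d\n", 1);

		return trace_handle_return(&iter->seq);
	}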
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d2867ccc6aca..ec589a4612df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1134,3 +1134,20 @@ bool trace_probe_match_command_args(struct trace_probe *tp,
}
return true;
}
+
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **))
+{
+ int argc = 0, ret = 0;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, (const char **)argv);
+
+ argv_free(argv);
+
+ return ret;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2f703a20c724..7ce4027089ee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -341,6 +341,7 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
int argc, const char **argv);
+int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c0181066dbe9..e5778d1d7a5b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -67,7 +67,7 @@ static bool function_enabled;
static int
func_prolog_preempt_disable(struct trace_array *tr,
struct trace_array_cpu **data,
- int *pc)
+ unsigned int *trace_ctx)
{
long disabled;
int cpu;
@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (likely(!wakeup_task))
return 0;
- *pc = preempt_count();
+ *trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc, ret = 0;
+ unsigned int trace_ctx;
+ int ret = 0;
if (ftrace_graph_ignore_func(trace))
return 0;
@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
- local_save_flags(flags);
- ret = __trace_graph_entry(tr, trace, flags, pc);
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
- unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
ftrace_graph_addr_finish(trace);
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- local_save_flags(flags);
- __trace_graph_return(tr, trace, flags, pc);
+ __trace_graph_return(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
- if (!func_prolog_preempt_disable(tr, &data, &pc))
+ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
local_irq_restore(flags);
atomic_dec(&data->disabled);
@@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s)
static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
if (is_graph(tr))
- trace_graph_function(tr, ip, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, trace_ctx);
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -375,7 +372,7 @@ static void
tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct ctx_switch_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
struct task_struct *curr,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
@@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_buffer *buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
@@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
unsigned long flags;
long disabled;
int cpu;
- int pc;
+ unsigned int trace_ctx;
tracing_record_cmdline(prev);
@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
if (next != wakeup_task)
return;
- pc = preempt_count();
-
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
@@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
goto out;
local_irq_save(flags);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+
arch_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */
@@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* The task we are waiting for is waking up */
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx);
+ tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
@@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
- unsigned long flags;
long disabled;
- int pc;
+ unsigned int trace_ctx;
if (likely(!tracer_enabled))
return;
@@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p)
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- pc = preempt_count();
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
+ trace_ctx = tracing_gen_ctx();
+
/* interrupts should be off from try_to_wake_up */
arch_spin_lock(&wakeup_lock);
@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p)
wakeup_task = get_task_struct(p);
- local_save_flags(flags);
-
data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
- tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
- __trace_stack(wakeup_trace, flags, 0, pc);
+ tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx);
+ __trace_stack(wakeup_trace, trace_ctx, 0);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
	 * is not called by an assembly function (whereas schedule is)
* it should be safe to use it here.
*/
- __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
out_locked:
arch_spin_unlock(&wakeup_lock);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d85a2f0f316b..8bfcd3b09422 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
unsigned long args[6];
- int pc;
int syscall_nr;
int size;
@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
- sys_data->enter_event->event.type, size, irq_flags, pc);
+ sys_data->enter_event->event.type, size, trace_ctx);
if (!event)
return;
@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_ctx);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- unsigned long irq_flags;
- int pc;
+ unsigned int trace_ctx;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- local_save_flags(irq_flags);
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->exit_event->event.type, sizeof(*entry),
- irq_flags, pc);
+ trace_ctx);
if (!event)
return;
@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->ret = syscall_get_return_value(current, regs);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ trace_ctx);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3cf7128e1ad3..9b50869a5ddb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -34,7 +34,7 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_create(const char *raw_command);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
@@ -530,7 +530,7 @@ end:
* Argument syntax:
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS]
*/
-static int trace_uprobe_create(int argc, const char **argv)
+static int __trace_uprobe_create(int argc, const char **argv)
{
struct trace_uprobe *tu;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
@@ -716,14 +716,19 @@ fail_address_parse:
return ret;
}
-static int create_or_delete_trace_uprobe(int argc, char **argv)
+int trace_uprobe_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_uprobe_create);
+}
+
+static int create_or_delete_trace_uprobe(const char *raw_command)
{
int ret;
- if (argv[0][0] == '-')
- return dyn_event_release(argc, argv, &trace_uprobe_ops);
+ if (raw_command[0] == '-')
+ return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(argc, (const char **)argv);
+ ret = trace_uprobe_create(raw_command);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -961,7 +966,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type, size, 0, 0);
+ call->event.type, size, 0);
if (!event)
return;
@@ -977,7 +982,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0);
}
/* uprobe handler */
@@ -1635,7 +1640,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
}
#endif /* CONFIG_PERF_EVENTS */
-/* Make a trace interface for controling probe points */
+/* Make a trace interface for controlling probe points */
static __init int init_uprobe_trace(void)
{
int ret;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 7261fa0f5e3c..9f478d29b926 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -53,6 +53,12 @@ struct tp_probes {
struct tracepoint_func probes[];
};
+/* Called in place of a removed func when allocating a new tp_funcs fails */
+static void tp_stub_func(void)
+{
+ return;
+}
+
static inline void *allocate_probes(int count)
{
struct tp_probes *p = kmalloc(struct_size(p, probes, count),
@@ -130,8 +136,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
int prio)
{
struct tracepoint_func *old, *new;
- int nr_probes = 0;
- int pos = -1;
+ int iter_probes; /* Iterate over old probe array. */
+ int nr_probes = 0; /* Counter for probes */
+ int pos = -1; /* Insertion position into new array */
if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
@@ -140,13 +147,13 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- /* Insert before probes of lower priority */
- if (pos < 0 && old[nr_probes].prio < prio)
- pos = nr_probes;
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue; /* Skip stub functions. */
+ if (old[iter_probes].func == tp_func->func &&
+ old[iter_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ nr_probes++;
}
}
/* + 2 : one for new probe, one for NULL func */
@@ -154,20 +161,24 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old) {
- if (pos < 0) {
- pos = nr_probes;
- memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- } else {
- /* Copy higher priority probes ahead of the new probe */
- memcpy(new, old, pos * sizeof(struct tracepoint_func));
- /* Copy the rest after it. */
- memcpy(new + pos + 1, old + pos,
- (nr_probes - pos) * sizeof(struct tracepoint_func));
+ nr_probes = 0;
+ for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+ if (old[iter_probes].func == tp_stub_func)
+ continue;
+ /* Insert before probes of lower priority */
+ if (pos < 0 && old[iter_probes].prio < prio)
+ pos = nr_probes++;
+ new[nr_probes++] = old[iter_probes];
}
- } else
+ if (pos < 0)
+ pos = nr_probes++;
+ /* nr_probes now points to the end of the new array */
+ } else {
pos = 0;
+ nr_probes = 1; /* must point at end of array */
+ }
new[pos] = *tp_func;
- new[nr_probes + 1].func = NULL;
+ new[nr_probes].func = NULL;
*funcs = new;
debug_print_probes(*funcs);
return old;
@@ -188,8 +199,9 @@ static void *func_remove(struct tracepoint_func **funcs,
/* (N -> M), (N > 1, M >= 0) probes */
if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ if ((old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data) ||
+ old[nr_probes].func == tp_stub_func)
nr_del++;
}
}
@@ -208,14 +220,27 @@ static void *func_remove(struct tracepoint_func **funcs,
/* N -> M, (N > 1, M > 0) */
/* + 1 for NULL */
new = allocate_probes(nr_probes - nr_del + 1);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- for (i = 0; old[i].func; i++)
- if (old[i].func != tp_func->func
- || old[i].data != tp_func->data)
- new[j++] = old[i];
- new[nr_probes - nr_del].func = NULL;
- *funcs = new;
+ if (new) {
+ for (i = 0; old[i].func; i++) {
+ if ((old[i].func != tp_func->func ||
+ old[i].data != tp_func->data) &&
+ old[i].func != tp_stub_func)
+ new[j++] = old[i];
+ }
+ new[nr_probes - nr_del].func = NULL;
+ *funcs = new;
+ } else {
+ /*
+			 * Failed to allocate; replace the old function
+			 * with calls to tp_stub_func.
+ */
+ for (i = 0; old[i].func; i++) {
+ if (old[i].func == tp_func->func &&
+ old[i].data == tp_func->data)
+ WRITE_ONCE(old[i].func, tp_stub_func);
+ }
+ *funcs = old;
+ }
}
debug_print_probes(*funcs);
return old;
@@ -295,10 +320,12 @@ static int tracepoint_remove_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
- if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ if (WARN_ON_ONCE(IS_ERR(old)))
return PTR_ERR(old);
- }
+
+ if (tp_funcs == old)
+ /* Failed allocating new tp_funcs, replaced func with stub */
+ return 0;
if (!tp_funcs) {
/* Removed last function */
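The fallback above keeps the probe array valid when shrinking it cannot allocate: instead of failing the removal, the dead entry is overwritten in place. The key step, restated as a sketch (entry is a hypothetical pointer into the live array):

	/* Readers may be iterating concurrently, so the no-op stub is
	 * published atomically rather than NULLing the slot. */
	WRITE_ONCE(entry->func, tp_stub_func);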
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..9a4b980d695b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
+ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
}
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+ struct user_namespace *map_ns,
+ struct uid_gid_map *new_map)
+{
+ int idx;
+ const struct user_namespace *file_ns = file->f_cred->user_ns;
+ struct uid_gid_extent *extent0 = NULL;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent0 = &new_map->extent[idx];
+ else
+ extent0 = &new_map->forward[idx];
+ if (extent0->lower_first == 0)
+ break;
+
+ extent0 = NULL;
+ }
+
+ if (!extent0)
+ return true;
+
+ if (map_ns == file_ns) {
+ /* The process unshared its ns and is writing to its own
+		 * /proc/self/uid_map. User already has full capabilities in
+ * the new namespace. Verify that the parent had CAP_SETFCAP
+ * when it unshared.
+		 */
+ if (!file_ns->parent_could_setfcap)
+ return false;
+ } else {
+ /* Process p1 is writing to uid_map of p2, who is in a child
+		 * user namespace of p1's. Verify that the opener of the map
+ * file has CAP_SETFCAP against the parent of the new map
+ * namespace */
+ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+ return false;
+ }
+
+ return true;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map *parent_map)
{
struct seq_file *seq = file->private_data;
- struct user_namespace *ns = seq->private;
+ struct user_namespace *map_ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/*
* Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
goto out;
/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
goto out;
ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
struct uid_gid_map *new_map)
{
const struct cred *cred = file->f_cred;
+
+ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+ return false;
+
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
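From userspace, the new rule means writing a mapping for uid 0 can now fail with EPERM even when the existing capability checks pass. An illustrative writer (not from this patch), with minimal error handling:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	static int map_root(pid_t pid)
	{
		char path[64];
		int fd, ret;

		snprintf(path, sizeof(path), "/proc/%d/uid_map", (int)pid);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		/* "0 0 1": map uid 0 in the namespace to uid 0 outside;
		 * rejected by verify_root_map() without CAP_SETFCAP. */
		ret = write(fd, "0 0 1", 5) == 5 ? 0 : -1;
		close(fd);

		return ret;
	}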
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 0b35212ffc3d..bb7bb3b478ab 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)
struct umd_info *umd_info = info->data;
/* cleanup if umh_setup() was successful but exec failed */
- if (info->retval) {
- fput(umd_info->pipe_to_umh);
- fput(umd_info->pipe_from_umh);
- put_pid(umd_info->tgid);
- umd_info->tgid = NULL;
- }
+ if (info->retval)
+ umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+ put_pid(info->tgid);
+ info->tgid = NULL;
}
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
/**
* fork_usermode_driver - fork a usermode driver
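A hedged usage sketch for the newly exported helper; only the cleanup call comes from this patch, and the handshake step is hypothetical:

	static int start_driver(struct umd_info *info)
	{
		int err;

		err = fork_usermode_driver(info);
		if (err)
			return err;

		err = do_handshake(info);		/* hypothetical caller step */
		if (err)
			umd_cleanup_helper(info);	/* drop pipes and tgid */

		return err;
	}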
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 0ef8f65bd2d7..9c9eb20dd2c5 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -413,7 +413,7 @@ static void put_watch(struct watch *watch)
}
/**
- * init_watch_queue - Initialise a watch
+ * init_watch - Initialise a watch
* @watch: The watch to initialise.
* @wqueue: The queue to assign.
*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 71109065bd8e..107bc38b1945 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void)
* update as well, the only side effect might be a cycle delay for
* the softlockup check.
*/
- for_each_cpu(cpu, &watchdog_allowed_mask)
+ for_each_cpu(cpu, &watchdog_allowed_mask) {
per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
- wq_watchdog_touch(-1);
+ wq_watchdog_touch(cpu);
+ }
}
void touch_softlockup_watchdog_sync(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 894bb885b40b..b19d759e55a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
lockdep_assert_irqs_disabled();
- debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
@@ -1494,6 +1493,7 @@ retry:
worklist = &pwq->delayed_works;
}
+ debug_work_activate(work);
insert_work(pwq, work, worklist, work_flags);
out:
@@ -1630,7 +1630,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
+ WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -2964,8 +2964,8 @@ reflush:
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
- wq->name, flush_cnt);
+ pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
+ wq->name, __func__, flush_cnt);
mutex_unlock(&wq->mutex);
goto reflush;
@@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
continue;
/* get the latest of pool and touched timestamps */
+ if (pool->cpu >= 0)
+ touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
+ else
+ touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
- touched = READ_ONCE(wq_watchdog_touched);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
- if (pool->cpu >= 0) {
- unsigned long cpu_touched =
- READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
- pool->cpu));
- if (time_after(cpu_touched, ts))
- ts = cpu_touched;
- }
-
/* did we stall? */
if (time_after(jiffies, ts + thresh)) {
lockup_detected = true;
@@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
- else
- wq_watchdog_touched = jiffies;
+
+ wq_watchdog_touched = jiffies;
}
static void wq_watchdog_set_thresh(unsigned long thresh)
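Read together with the kernel/watchdog.c hunk above: touch_all_softlockup_watchdogs() now touches each CPU's workqueue stamp instead of calling wq_watchdog_touch(-1) once, and wq_watchdog_touch() compensates by always refreshing the global stamp. The reasoning, summarized:

	/*
	 * Unbound pools have pool->cpu < 0 and are checked only against
	 * the global wq_watchdog_touched, so a per-CPU touch must also
	 * refresh the global stamp or unbound pools could false-positive.
	 * Bound pools read only their own per-CPU stamp, which is why the
	 * stall check now selects one timestamp instead of max()-ing both
	 * (summary of the change, not code from the patch).
	 */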