aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/fgraph.c4
-rw-r--r--kernel/trace/ftrace.c6
-rw-r--r--kernel/trace/trace.c73
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c104
-rw-r--r--kernel/trace/trace_events_hist.c33
-rw-r--r--kernel/trace/trace_events_synth.c14
-rw-r--r--kernel/trace/trace_events_user.c1690
-rw-r--r--kernel/trace/trace_osnoise.c4
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
14 files changed, 1919 insertions, 34 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 99dd4ca63d68..9bb54c0b3b2d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -763,6 +763,20 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c6f11a139eac..d77cd8032213 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 172ef545730d..7fa2ebc07f60 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -337,8 +337,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -840,8 +838,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4214d6a69e60..2e114659f7f8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7136,6 +7136,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7386,7 +7388,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb44418574f9..08ea781540b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -7725,7 +7735,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7733,26 +7743,52 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+ struct tracing_log_err *err;
+
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
{
struct tracing_log_err *err;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- else
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
tr->n_err_log_entries++;
return err;
}
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd)
+ return ERR_PTR(-ENOMEM);
list_del(&err->list);
return err;
@@ -7813,22 +7849,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7846,7 +7885,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7874,9 +7913,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -10122,6 +10161,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5b09c31e077..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1877,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..e11e167b7809 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -384,6 +392,12 @@ static void test_event_printk(struct trace_event_call *call)
if (!(dereference_flags & (1ULL << arg)))
goto next_arg;
+ /* Check for __get_sockaddr */;
+ if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
+ dereference_flags &= ~(1ULL << arg);
+ goto next_arg;
+ }
+
/* Find the REC-> in the argument */
c = strchr(fmt + i, ',');
r = strstr(fmt + i, "REC->");
@@ -759,7 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dc7f733b4cb3..44db5ba9cabb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -5610,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +57,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..8b3d241a31c2
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1690 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/user_events.h>
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
+
+#define MAX_BPF_COPY_SIZE PAGE_SIZE
+#define MAX_STACK_BPF_DATA 512
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ atomic_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != 0)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static void user_event_parse_flags(struct user_event *user, char *flags)
+{
+ char *flag;
+
+ if (flags == NULL)
+ return;
+
+ while ((flag = strsep(&flags, ",")) != NULL) {
+ if (strcmp(flag, "BPF_ITER") == 0)
+ user->flags |= FLAG_BPF_ITER;
+ }
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != 0)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ register_page_data[user->index] = 0;
+ clear_bit(user->index, page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ atomic_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+static void user_event_bpf(struct user_event *user, struct iov_iter *i)
+{
+ struct user_bpf_context context;
+ struct user_bpf_iter bpf_i;
+ char fast_data[MAX_STACK_BPF_DATA];
+ void *temp = NULL;
+
+ if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
+ /* Raw iterator */
+ context.data_type = USER_BPF_DATA_ITER;
+ context.data_len = i->count;
+ context.iter = &bpf_i;
+
+ bpf_i.iov_offset = i->iov_offset;
+ bpf_i.iov = i->iov;
+ bpf_i.nr_segs = i->nr_segs;
+ } else if (i->nr_segs == 1 && iter_is_iovec(i)) {
+ /* Single buffer from user */
+ context.data_type = USER_BPF_DATA_USER;
+ context.data_len = i->count;
+ context.udata = i->iov->iov_base + i->iov_offset;
+ } else {
+ /* Multi buffer from user */
+ struct iov_iter copy = *i;
+ size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
+
+ context.data_type = USER_BPF_DATA_KERNEL;
+ context.kdata = fast_data;
+
+ if (unlikely(copy_size > sizeof(fast_data))) {
+ temp = kmalloc(copy_size, GFP_NOWAIT);
+
+ if (temp)
+ context.kdata = temp;
+ else
+ copy_size = sizeof(fast_data);
+ }
+
+ context.data_len = copy_nofault(context.kdata,
+ copy_size, &copy);
+ }
+
+ trace_call_bpf(&user->call, &context);
+
+ kfree(temp);
+}
+
+/*
+ * Writes the user supplied payload out to perf ring buffer or eBPF program.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ if (bpf_prog_array_valid(&user->call))
+ user_event_bpf(user, i);
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ atomic_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ atomic_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ mutex_lock(&reg_mutex);
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (!ret)
+ atomic_dec(&user->refcnt);
+
+ mutex_unlock(&reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (atomic_read(&user->refcnt) != 0)
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name, *arg_name;
+ int len, pos, i = *iout;
+ bool colon = false, match = false;
+
+ if (i >= argc)
+ return false;
+
+ len = MAX_FIELD_ARG_NAME;
+ field_name = kmalloc(len, GFP_KERNEL);
+ arg_name = kmalloc(len, GFP_KERNEL);
+
+ if (!arg_name || !field_name)
+ goto out;
+
+ pos = 0;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(arg_name + pos, len - pos, " ");
+
+ pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ colon = true;
+ break;
+ }
+ }
+
+ pos = 0;
+
+ pos += snprintf(field_name + pos, len - pos, field->type);
+ pos += snprintf(field_name + pos, len - pos, " ");
+ pos += snprintf(field_name + pos, len - pos, field->name);
+
+ if (colon)
+ pos += snprintf(field_name + pos, len - pos, ";");
+
+ *iout = i;
+
+ match = strcmp(arg_name, field_name) == 0;
+out:
+ kfree(arg_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->tracepoint.name = name;
+
+ user_event_parse_flags(user, flags);
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+
+ user->class.system = USER_EVENTS_SYSTEM;
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+ ret = user_event_trace_register(user);
+ mutex_unlock(&event_mutex);
+
+ if (ret)
+ goto put_user;
+
+ user->index = index;
+
+ /* Ensure we track ref */
+ atomic_inc(&user->refcnt);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, page_bitmap);
+ hash_add(register_table, &user->node, key);
+
+ *newuser = user;
+ return 0;
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+ u32 key;
+ int ret;
+ struct user_event *user = find_user_event(name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ /* Ensure we are the last ref */
+ if (atomic_read(&user->refcnt) != 1) {
+ ret = -EBUSY;
+ goto put_ref;
+ }
+
+ ret = destroy_user_event(user);
+
+ if (ret)
+ goto put_ref;
+
+ return ret;
+put_ref:
+ /* No longer have this ref */
+ atomic_dec(&user->refcnt);
+
+ return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(file->private_data);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(file->private_data,
+ lockdep_is_held(&reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ atomic_inc(&user->refcnt);
+
+ rcu_assign_pointer(file->private_data, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(file, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ atomic_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_reg(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_del(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&reg_mutex);
+
+ refs = file->private_data;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ atomic_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&reg_mutex);
+
+ kfree(refs);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_EVENTS)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ mutex_lock(&reg_mutex);
+
+ hash_for_each(register_table, i, user, node) {
+ status = register_page_data[user->index];
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ if (flags & FLAG_BPF_ITER)
+ seq_puts(m, " FLAG:BPF_ITER");
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = register_page_data + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static int __init trace_events_user_init(void)
+{
+ struct page *pages;
+ int ret;
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(page_bitmap, MAX_EVENTS);
+ set_bit(0, page_bitmap);
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+ if (!pages)
+ return -ENOMEM;
+ register_page_data = page_address(pages);
+
+ set_page_reservations(true);
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ set_page_reservations(false);
+ __free_pages(pages, MAX_PAGE_ORDER);
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 5e3c62a08fc0..e9ae1f33a7f0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..45796d8bd4b2 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;