aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c114
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/audit_tree.c8
-rw-r--r--kernel/auditfilter.c5
-rw-r--r--kernel/auditsc.c22
-rw-r--r--kernel/bpf/cgroup.c39
-rw-r--r--kernel/bpf/disasm.c52
-rw-r--r--kernel/bpf/disasm.h5
-rw-r--r--kernel/bpf/inode.c3
-rw-r--r--kernel/bpf/sockmap.c1024
-rw-r--r--kernel/bpf/stackmap.c257
-rw-r--r--kernel/bpf/syscall.c198
-rw-r--r--kernel/bpf/verifier.c73
-rw-r--r--kernel/cgroup/cgroup.c21
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/debug/kdb/kdb_bp.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c89
-rw-r--r--kernel/debug/kdb/kdb_support.c4
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/Kconfig6
-rw-r--r--kernel/irq/affinity.c162
-rw-r--r--kernel/irq/autoprobe.c2
-rw-r--r--kernel/irq/chip.c10
-rw-r--r--kernel/irq/cpuhotplug.c1
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/devres.c1
-rw-r--r--kernel/irq/dummychip.c1
-rw-r--r--kernel/irq/generic-chip.c1
-rw-r--r--kernel/irq/handle.c23
-rw-r--r--kernel/irq/ipi.c3
-rw-r--r--kernel/irq/irq_sim.c14
-rw-r--r--kernel/irq/irqdesc.c23
-rw-r--r--kernel/irq/irqdomain.c2
-rw-r--r--kernel/irq/manage.c21
-rw-r--r--kernel/irq/matrix.c8
-rw-r--r--kernel/irq/msi.c3
-rw-r--r--kernel/irq/pm.c3
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/irq/timings.c13
-rw-r--r--kernel/kexec_file.c619
-rw-r--r--kernel/panic.c65
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/pid_namespace.c67
-rw-r--r--kernel/power/hibernate.c26
-rw-r--r--kernel/power/qos.c2
-rw-r--r--kernel/printk/printk.c66
-rw-r--r--kernel/resource.c3
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/sched/idle.c34
-rw-r--r--kernel/sched/rt.c4
-rw-r--r--kernel/sched/sched.h17
-rw-r--r--kernel/signal.c10
-rw-r--r--kernel/softirq.c82
-rw-r--r--kernel/sysctl.c35
-rw-r--r--kernel/time/alarmtimer.c34
-rw-r--r--kernel/time/clocksource.c66
-rw-r--r--kernel/time/hrtimer.c69
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-stubs.c2
-rw-r--r--kernel/time/posix-timers.c26
-rw-r--r--kernel/time/tick-common.c15
-rw-r--r--kernel/time/tick-internal.h6
-rw-r--r--kernel/time/tick-sched.c259
-rw-r--r--kernel/time/tick-sched.h18
-rw-r--r--kernel/time/time.c12
-rw-r--r--kernel/time/timekeeping.c219
-rw-r--r--kernel/time/timekeeping.h1
-rw-r--r--kernel/time/timekeeping_internal.h2
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/bpf_trace.c226
-rw-r--r--kernel/trace/ftrace.c7
-rw-r--r--kernel/trace/ring_buffer.c226
-rw-r--r--kernel/trace/trace.c120
-rw-r--r--kernel/trace/trace.h33
-rw-r--r--kernel/trace/trace_clock.c4
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events_filter.c2383
-rw-r--r--kernel/trace/trace_events_hist.c4450
-rw-r--r--kernel/trace/trace_events_trigger.c53
-rw-r--r--kernel/trace/trace_printk.c4
-rw-r--r--kernel/trace/trace_uprobe.c25
-rw-r--r--kernel/trace/tracing_map.c232
-rw-r--r--kernel/trace/tracing_map.h18
-rw-r--r--kernel/ucount.c1
-rw-r--r--kernel/utsname.c20
-rw-r--r--kernel/workqueue.c60
94 files changed, 9065 insertions, 2834 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 227db99b0f19..670665c6e2a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -38,7 +38,8 @@
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
- * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ * Audit userspace, documentation, tests, and bug/issue trackers:
+ * https://github.com/linux-audit
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -180,9 +181,21 @@ static char *audit_feature_names[2] = {
"loginuid_immutable",
};
-
-/* Serialize requests from userspace. */
-DEFINE_MUTEX(audit_cmd_mutex);
+/**
+ * struct audit_ctl_mutex - serialize requests from userspace
+ * @lock: the mutex used for locking
+ * @owner: the task which owns the lock
+ *
+ * Description:
+ * This is the lock struct used to ensure we only process userspace requests
+ * in an orderly fashion. We can't simply use a mutex/lock here because we
+ * need to track lock ownership so we don't end up blocking the lock owner in
+ * audit_log_start() or similar.
+ */
+static struct audit_ctl_mutex {
+ struct mutex lock;
+ void *owner;
+} audit_cmd_mutex;
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -227,6 +240,36 @@ int auditd_test_task(struct task_struct *task)
}
/**
+ * audit_ctl_lock - Take the audit control lock
+ */
+void audit_ctl_lock(void)
+{
+ mutex_lock(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = current;
+}
+
+/**
+ * audit_ctl_unlock - Drop the audit control lock
+ */
+void audit_ctl_unlock(void)
+{
+ audit_cmd_mutex.owner = NULL;
+ mutex_unlock(&audit_cmd_mutex.lock);
+}
+
+/**
+ * audit_ctl_owner_current - Test to see if the current task owns the lock
+ *
+ * Description:
+ * Return true if the current task owns the audit control lock, false if it
+ * doesn't own the lock.
+ */
+static bool audit_ctl_owner_current(void)
+{
+ return (current == audit_cmd_mutex.owner);
+}
+
+/**
* auditd_pid_vnr - Return the auditd PID relative to the namespace
*
* Description:
@@ -443,15 +486,15 @@ static int audit_set_failure(u32 state)
* Drop any references inside the auditd connection tracking struct and free
* the memory.
*/
- static void auditd_conn_free(struct rcu_head *rcu)
- {
+static void auditd_conn_free(struct rcu_head *rcu)
+{
struct auditd_connection *ac;
ac = container_of(rcu, struct auditd_connection, rcu);
put_pid(ac->pid);
put_net(ac->net);
kfree(ac);
- }
+}
/**
* auditd_set - Set/Reset the auditd connection state
@@ -860,8 +903,8 @@ int audit_send_list(void *_dest)
struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(sk, skb, dest->portid, 0);
@@ -902,8 +945,8 @@ static int audit_send_reply_thread(void *arg)
struct audit_reply *reply = (struct audit_reply *)arg;
struct sock *sk = audit_get_sk(reply->net);
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
@@ -1058,6 +1101,8 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ if (!ab)
+ return;
audit_log_task_info(ab, current);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
@@ -1466,7 +1511,7 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
@@ -1475,7 +1520,7 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
/* Run custom bind function on netlink socket group connect or bind requests. */
@@ -1547,6 +1592,9 @@ static int __init audit_init(void)
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
+ mutex_init(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = NULL;
+
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
@@ -1567,19 +1615,26 @@ static int __init audit_init(void)
}
postcore_initcall(audit_init);
-/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+/*
+ * Process kernel command-line parameter at boot time.
+ * audit={0|off} or audit={1|on}.
+ */
static int __init audit_enable(char *str)
{
- long val;
-
- if (kstrtol(str, 0, &val))
- panic("audit: invalid 'audit' parameter value (%s)\n", str);
- audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+ if (!strcasecmp(str, "off") || !strcmp(str, "0"))
+ audit_default = AUDIT_OFF;
+ else if (!strcasecmp(str, "on") || !strcmp(str, "1"))
+ audit_default = AUDIT_ON;
+ else {
+ pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = AUDIT_ON;
+ }
if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
if (audit_set_enabled(audit_default))
- panic("audit: error setting audit state (%d)\n", audit_default);
+ pr_err("audit: error setting audit state (%d)\n",
+ audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -1710,8 +1765,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
* while holding the mutex */
- if (!(auditd_test_task(current) ||
- (current == __mutex_owner(&audit_cmd_mutex)))) {
+ if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
@@ -2254,33 +2308,23 @@ EXPORT_SYMBOL(audit_log_task_info);
/**
* audit_log_link_denied - report a link restriction denial
* @operation: specific link operation
- * @link: the path that triggered the restriction
*/
-void audit_log_link_denied(const char *operation, const struct path *link)
+void audit_log_link_denied(const char *operation)
{
struct audit_buffer *ab;
- struct audit_names *name;
- name = kzalloc(sizeof(*name), GFP_NOFS);
- if (!name)
+ if (!audit_enabled || audit_dummy_context())
return;
/* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
ab = audit_log_start(current->audit_context, GFP_KERNEL,
AUDIT_ANOM_LINK);
if (!ab)
- goto out;
+ return;
audit_log_format(ab, "op=%s", operation);
audit_log_task_info(ab, current);
audit_log_format(ab, " res=0");
audit_log_end(ab);
-
- /* Generate AUDIT_PATH record with object. */
- name->type = AUDIT_TYPE_NORMAL;
- audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
- audit_log_name(current->audit_context, name, link, 0, NULL);
-out:
- kfree(name);
}
/**
diff --git a/kernel/audit.h b/kernel/audit.h
index af5bc59487ed..214e14948370 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -341,4 +341,5 @@ extern struct list_head *audit_killed_trees(void);
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
-extern struct mutex audit_cmd_mutex;
+extern void audit_ctl_lock(void);
+extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index fd353120e0d9..67e6956c0b61 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -709,7 +709,7 @@ static int prune_tree_thread(void *unused)
schedule();
}
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -727,7 +727,7 @@ static int prune_tree_thread(void *unused)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
return 0;
}
@@ -924,7 +924,7 @@ static void audit_schedule_prune(void)
*/
void audit_kill_trees(struct list_head *list)
{
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
@@ -942,7 +942,7 @@ void audit_kill_trees(struct list_head *list)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
/*
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4a1758adb222..d7a807e81451 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -258,8 +258,8 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
- if (rule->action == AUDIT_ALWAYS)
- goto exit_err;
+ pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
@@ -496,7 +496,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
if (!gid_valid(f->gid))
goto exit_free;
break;
- case AUDIT_SESSIONID:
case AUDIT_ARCH:
entry->rule.arch_f = f;
break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e80459f7e132..4e0a4ac803db 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1511,30 +1511,28 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
struct audit_context *context = tsk->audit_context;
enum audit_state state;
- if (!context)
+ if (!audit_enabled || !context)
return;
BUG_ON(context->in_syscall || context->name_count);
- if (!audit_enabled)
+ state = context->state;
+ if (state == AUDIT_DISABLED)
return;
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ context->prio = 0;
+ if (auditd_test_task(tsk))
+ return;
+ }
+
context->arch = syscall_get_arch();
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
-
- state = context->state;
- context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
- context->prio = 0;
- state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
- }
- if (state == AUDIT_DISABLED)
- return;
-
context->serial = 0;
context->ctime = current_kernel_time64();
context->in_syscall = 1;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c1c0b60d3f2f..43171a0bb02b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
/**
+ * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
+ * provided by user sockaddr
+ * @sk: sock struct that will use sockaddr
+ * @uaddr: sockaddr struct provided by user
+ * @type: The type of program to be exectuted
+ *
+ * socket is expected to be of type INET or INET6.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+ struct sockaddr *uaddr,
+ enum bpf_attach_type type)
+{
+ struct bpf_sock_addr_kern ctx = {
+ .sk = sk,
+ .uaddr = uaddr,
+ };
+ struct cgroup *cgrp;
+ int ret;
+
+ /* Check socket family since not all sockets represent network
+ * endpoint (e.g. AF_UNIX).
+ */
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
+
+/**
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
* @sk: socket to get cgroup from
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
@@ -545,7 +581,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -566,6 +602,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id)
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 8740406df2cd..d6b76377cb6e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = {
};
static void print_bpf_end_insn(bpf_insn_print_t verbose,
- struct bpf_verifier_env *env,
+ void *private_data,
const struct bpf_insn *insn)
{
- verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ verbose(private_data, "(%02x) r%d = %s%d r%d\n",
+ insn->code, insn->dst_reg,
BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
insn->imm, insn->dst_reg);
}
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
- struct bpf_verifier_env *env,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
{
@@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(env, "BUG_alu64_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
else
- print_bpf_end_insn(verbose, env, insn);
+ print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(env, "(%02x) r%d = %s-r%d\n",
+ verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
insn->code, insn->dst_reg,
class == BPF_ALU ? "(u32) " : "",
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->src_reg);
} else {
- verbose(env, "(%02x) %sr%d %s %s%d\n",
+ verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
@@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
}
} else if (class == BPF_STX) {
if (BPF_MODE(insn->code) == BPF_MEM)
- verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
else
- verbose(env, "BUG_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_%02x\n", insn->code);
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose(env, "BUG_st_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
return;
}
- verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose(env, "BUG_ldx_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
- verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->imm);
} else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
@@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (map_ptr && !allow_ptr_leaks)
imm = 0;
- verbose(env, "(%02x) r%d = %s\n",
+ verbose(cbs->private_data, "(%02x) r%d = %s\n",
insn->code, insn->dst_reg,
__func_imm_name(cbs, insn, imm,
tmp, sizeof(tmp)));
} else {
- verbose(env, "BUG_ld_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
} else if (class == BPF_JMP) {
@@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
char tmp[64];
if (insn->src_reg == BPF_PSEUDO_CALL) {
- verbose(env, "(%02x) call pc%s\n",
+ verbose(cbs->private_data, "(%02x) call pc%s\n",
insn->code,
__func_get_name(cbs, insn,
tmp, sizeof(tmp)));
} else {
strcpy(tmp, "unknown");
- verbose(env, "(%02x) call %s#%d\n", insn->code,
+ verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code,
__func_get_name(cbs, insn,
tmp, sizeof(tmp)),
insn->imm);
}
} else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose(env, "(%02x) goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose(env, "(%02x) exit\n", insn->code);
+ verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->src_reg, insn->off);
} else {
- verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
} else {
- verbose(env, "(%02x) %s\n",
+ verbose(cbs->private_data, "(%02x) %s\n",
insn->code, bpf_class_string[class]);
}
}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 266fe8ee542b..e1324a834a24 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -22,14 +22,12 @@
#include <string.h>
#endif
-struct bpf_verifier_env;
-
extern const char *const bpf_alu_string[16];
extern const char *const bpf_class_string[8];
const char *func_id_name(int id);
-typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data,
const char *, ...);
typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
const struct bpf_insn *insn);
@@ -45,7 +43,6 @@ struct bpf_insn_cbs {
};
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
- struct bpf_verifier_env *env,
const struct bpf_insn *insn,
bool allow_ptr_leaks);
#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 81e2f6995adb..bf6da59ae0d0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
static struct dentry *
bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
+ /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
+ * extensions.
+ */
if (strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a927e89dad6e..8dd9210d7db7 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -38,8 +38,11 @@
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
+#include <linux/mm.h>
#include <net/strparser.h>
#include <net/tcp.h>
+#include <linux/ptr_ring.h>
+#include <net/inet_common.h>
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -47,6 +50,7 @@
struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
+ struct bpf_prog *bpf_tx_msg;
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
};
@@ -62,8 +66,7 @@ struct smap_psock_map_entry {
struct smap_psock {
struct rcu_head rcu;
- /* refcnt is used inside sk_callback_lock */
- u32 refcnt;
+ refcount_t refcnt;
/* datapath variables */
struct sk_buff_head rxqueue;
@@ -74,7 +77,17 @@ struct smap_psock {
int save_off;
struct sk_buff *save_skb;
+ /* datapath variables for tx_msg ULP */
+ struct sock *sk_redir;
+ int apply_bytes;
+ int cork_bytes;
+ int sg_size;
+ int eval;
+ struct sk_msg_buff *cork;
+ struct list_head ingress;
+
struct strparser strp;
+ struct bpf_prog *bpf_tx_msg;
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
struct list_head maps;
@@ -92,11 +105,33 @@ struct smap_psock {
void (*save_write_space)(struct sock *sk);
};
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len);
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
+
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
return rcu_dereference_sk_user_data(sk);
}
+static bool bpf_tcp_stream_read(const struct sock *sk)
+{
+ struct smap_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
+ empty = list_empty(&psock->ingress);
+out:
+ rcu_read_unlock();
+ return !empty;
+}
+
static struct proto tcp_bpf_proto;
static int bpf_tcp_init(struct sock *sk)
{
@@ -116,31 +151,50 @@ static int bpf_tcp_init(struct sock *sk)
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
+
+ if (psock->bpf_tx_msg) {
+ tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
+ tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
+ tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
+ tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+ }
+
sk->sk_prot = &tcp_bpf_proto;
rcu_read_unlock();
return 0;
}
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+
static void bpf_tcp_release(struct sock *sk)
{
struct smap_psock *psock;
rcu_read_lock();
psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
- if (likely(psock)) {
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ psock->cork = NULL;
+ }
+
+ if (psock->sk_proto) {
sk->sk_prot = psock->sk_proto;
psock->sk_proto = NULL;
}
+out:
rcu_read_unlock();
}
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e, *tmp;
+ struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
@@ -159,6 +213,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
close_fun = psock->save_close;
write_lock_bh(&sk->sk_callback_lock);
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ psock->cork = NULL;
+ }
+
+ list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
+ list_del(&md->list);
+ free_start_sg(psock->sock, md);
+ kfree(md);
+ }
+
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
@@ -175,6 +241,7 @@ enum __sk_action {
__SK_DROP = 0,
__SK_PASS,
__SK_REDIRECT,
+ __SK_NONE,
};
static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
@@ -186,10 +253,782 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
.release = bpf_tcp_release,
};
+static int memcopy_from_iter(struct sock *sk,
+ struct sk_msg_buff *md,
+ struct iov_iter *from, int bytes)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_curr, rc = -ENOSPC;
+
+ do {
+ int copy;
+ char *to;
+
+ if (md->sg_copybreak >= sg[i].length) {
+ md->sg_copybreak = 0;
+
+ if (++i == MAX_SKB_FRAGS)
+ i = 0;
+
+ if (i == md->sg_end)
+ break;
+ }
+
+ copy = sg[i].length - md->sg_copybreak;
+ to = sg_virt(&sg[i]) + md->sg_copybreak;
+ md->sg_copybreak += copy;
+
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ rc = copy_from_iter_nocache(to, copy, from);
+ else
+ rc = copy_from_iter(to, copy, from);
+
+ if (rc != copy) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ bytes -= copy;
+ if (!bytes)
+ break;
+
+ md->sg_copybreak = 0;
+ if (++i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != md->sg_end);
+out:
+ md->sg_curr = i;
+ return rc;
+}
+
+static int bpf_tcp_push(struct sock *sk, int apply_bytes,
+ struct sk_msg_buff *md,
+ int flags, bool uncharge)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sg;
+ int offset, ret = 0;
+ struct page *p;
+ size_t size;
+
+ while (1) {
+ sg = md->sg_data + md->sg_start;
+ size = (apply && apply_bytes < sg->length) ?
+ apply_bytes : sg->length;
+ offset = sg->offset;
+
+ tcp_rate_check_app_limited(sk);
+ p = sg_page(sg);
+retry:
+ ret = do_tcp_sendpages(sk, p, offset, size, flags);
+ if (ret != size) {
+ if (ret > 0) {
+ if (apply)
+ apply_bytes -= ret;
+ size -= ret;
+ offset += ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ goto retry;
+ }
+
+ sg->length = size;
+ sg->offset = offset;
+ return ret;
+ }
+
+ if (apply)
+ apply_bytes -= ret;
+ sg->offset += ret;
+ sg->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+
+ if (!sg->length) {
+ put_page(p);
+ md->sg_start++;
+ if (md->sg_start == MAX_SKB_FRAGS)
+ md->sg_start = 0;
+ sg_init_table(sg, 1);
+
+ if (md->sg_start == md->sg_end)
+ break;
+ }
+
+ if (apply && !apply_bytes)
+ break;
+ }
+ return 0;
+}
+
+static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data + md->sg_start;
+
+ if (md->sg_copy[md->sg_start]) {
+ md->data = md->data_end = 0;
+ } else {
+ md->data = sg_virt(sg);
+ md->data_end = md->data + sg->length;
+ }
+}
+
+static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_start;
+
+ do {
+ int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ if (!bytes)
+ break;
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != md->sg_end);
+}
+
+static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_start, free;
+
+ while (bytes && sg[i].length) {
+ free = sg[i].length;
+ if (bytes < free) {
+ sg[i].length -= bytes;
+ sg[i].offset += bytes;
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ sk_mem_uncharge(sk, sg[i].length);
+ put_page(sg_page(&sg[i]));
+ bytes -= sg[i].length;
+ sg[i].length = 0;
+ sg[i].page_link = 0;
+ sg[i].offset = 0;
+ i++;
+
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ }
+}
+
+static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = start, free = 0;
+
+ while (sg[i].length) {
+ free += sg[i].length;
+ sk_mem_uncharge(sk, sg[i].length);
+ put_page(sg_page(&sg[i]));
+ sg[i].length = 0;
+ sg[i].page_link = 0;
+ sg[i].offset = 0;
+ i++;
+
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ }
+
+ return free;
+}
+
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+ int free = free_sg(sk, md->sg_start, md);
+
+ md->sg_start = md->sg_end;
+ return free;
+}
+
+static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+ return free_sg(sk, md->sg_curr, md);
+}
+
+static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
+{
+ return ((_rc == SK_PASS) ?
+ (md->map ? __SK_REDIRECT : __SK_PASS) :
+ __SK_DROP);
+}
+
+static unsigned int smap_do_tx_msg(struct sock *sk,
+ struct smap_psock *psock,
+ struct sk_msg_buff *md)
+{
+ struct bpf_prog *prog;
+ unsigned int rc, _rc;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ /* If the policy was removed mid-send then default to 'accept' */
+ prog = READ_ONCE(psock->bpf_tx_msg);
+ if (unlikely(!prog)) {
+ _rc = SK_PASS;
+ goto verdict;
+ }
+
+ bpf_compute_data_pointers_sg(md);
+ rc = (*prog->bpf_func)(md, prog->insnsi);
+ psock->apply_bytes = md->apply_bytes;
+
+ /* Moving return codes from UAPI namespace into internal namespace */
+ _rc = bpf_map_msg_verdict(rc, md);
+
+ /* The psock has a refcount on the sock but not on the map and because
+ * we need to drop rcu read lock here its possible the map could be
+ * removed between here and when we need it to execute the sock
+ * redirect. So do the map lookup now for future use.
+ */
+ if (_rc == __SK_REDIRECT) {
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ psock->sk_redir = do_msg_redirect_map(md);
+ if (!psock->sk_redir) {
+ _rc = __SK_DROP;
+ goto verdict;
+ }
+ sock_hold(psock->sk_redir);
+ }
+verdict:
+ rcu_read_unlock();
+ preempt_enable();
+
+ return _rc;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
+ struct smap_psock *psock,
+ struct sk_msg_buff *md, int flags)
+{
+ bool apply = apply_bytes;
+ size_t size, copied = 0;
+ struct sk_msg_buff *r;
+ int err = 0, i;
+
+ r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!r))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ r->sg_start = md->sg_start;
+ i = md->sg_start;
+
+ do {
+ r->sg_data[i] = md->sg_data[i];
+
+ size = (apply && apply_bytes < md->sg_data[i].length) ?
+ apply_bytes : md->sg_data[i].length;
+
+ if (!sk_wmem_schedule(sk, size)) {
+ if (!copied)
+ err = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ r->sg_data[i].length = size;
+ md->sg_data[i].length -= size;
+ md->sg_data[i].offset += size;
+ copied += size;
+
+ if (md->sg_data[i].length) {
+ get_page(sg_page(&r->sg_data[i]));
+ r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
+ } else {
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ r->sg_end = i;
+ }
+
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes)
+ break;
+ }
+ } while (i != md->sg_end);
+
+ md->sg_start = i;
+
+ if (!err) {
+ list_add_tail(&r->list, &psock->ingress);
+ sk->sk_data_ready(sk);
+ } else {
+ free_start_sg(sk, r);
+ kfree(r);
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
+ struct sk_msg_buff *md,
+ int flags)
+{
+ struct smap_psock *psock;
+ struct scatterlist *sg;
+ int i, err, free = 0;
+ bool ingress = !!(md->flags & BPF_F_INGRESS);
+
+ sg = md->sg_data;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out_rcu;
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ goto out_rcu;
+
+ rcu_read_unlock();
+
+ if (ingress) {
+ err = bpf_tcp_ingress(sk, send, psock, md, flags);
+ } else {
+ lock_sock(sk);
+ err = bpf_tcp_push(sk, send, md, flags, false);
+ release_sock(sk);
+ }
+ smap_release_sock(psock, sk);
+ if (unlikely(err))
+ goto out;
+ return 0;
+out_rcu:
+ rcu_read_unlock();
+out:
+ i = md->sg_start;
+ while (sg[i].length) {
+ free += sg[i].length;
+ put_page(sg_page(&sg[i]));
+ sg[i].length = 0;
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ }
+ return free;
+}
+
+static inline void bpf_md_init(struct smap_psock *psock)
+{
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+}
+
+static void apply_bytes_dec(struct smap_psock *psock, int i)
+{
+ if (psock->apply_bytes) {
+ if (psock->apply_bytes < i)
+ psock->apply_bytes = 0;
+ else
+ psock->apply_bytes -= i;
+ }
+}
+
+static int bpf_exec_tx_verdict(struct smap_psock *psock,
+ struct sk_msg_buff *m,
+ struct sock *sk,
+ int *copied, int flags)
+{
+ bool cork = false, enospc = (m->sg_start == m->sg_end);
+ struct sock *redir;
+ int err = 0;
+ int send;
+
+more_data:
+ if (psock->eval == __SK_NONE)
+ psock->eval = smap_do_tx_msg(sk, psock, m);
+
+ if (m->cork_bytes &&
+ m->cork_bytes > psock->sg_size && !enospc) {
+ psock->cork_bytes = m->cork_bytes - psock->sg_size;
+ if (!psock->cork) {
+ psock->cork = kcalloc(1,
+ sizeof(struct sk_msg_buff),
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (!psock->cork) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
+ memcpy(psock->cork, m, sizeof(*m));
+ goto out_err;
+ }
+
+ send = psock->sg_size;
+ if (psock->apply_bytes && psock->apply_bytes < send)
+ send = psock->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ err = bpf_tcp_push(sk, send, m, flags, true);
+ if (unlikely(err)) {
+ *copied -= free_start_sg(sk, m);
+ break;
+ }
+
+ apply_bytes_dec(psock, send);
+ psock->sg_size -= send;
+ break;
+ case __SK_REDIRECT:
+ redir = psock->sk_redir;
+ apply_bytes_dec(psock, send);
+
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+
+ return_mem_sg(sk, send, m);
+ release_sock(sk);
+
+ err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
+ lock_sock(sk);
+
+ if (cork) {
+ free_start_sg(sk, m);
+ kfree(m);
+ m = NULL;
+ }
+ if (unlikely(err))
+ *copied -= err;
+ else
+ psock->sg_size -= send;
+ break;
+ case __SK_DROP:
+ default:
+ free_bytes_sg(sk, send, m);
+ apply_bytes_dec(psock, send);
+ *copied -= send;
+ psock->sg_size -= send;
+ err = -EACCES;
+ break;
+ }
+
+ if (likely(!err)) {
+ bpf_md_init(psock);
+ if (m &&
+ m->sg_data[m->sg_start].page_link &&
+ m->sg_data[m->sg_start].length)
+ goto more_data;
+ }
+
+out_err:
+ return err;
+}
+
+static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ struct smap_psock *psock;
+ int copied = 0;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
+
+ if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
+ goto out;
+ rcu_read_unlock();
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ lock_sock(sk);
+ while (copied != len) {
+ struct scatterlist *sg;
+ struct sk_msg_buff *md;
+ int i;
+
+ md = list_first_entry_or_null(&psock->ingress,
+ struct sk_msg_buff, list);
+ if (unlikely(!md))
+ break;
+ i = md->sg_start;
+ do {
+ struct page *page;
+ int n, copy;
+
+ sg = &md->sg_data[i];
+ copy = sg->length;
+ page = sg_page(sg);
+
+ if (copied + copy > len)
+ copy = len - copied;
+
+ n = copy_page_to_iter(page, sg->offset, copy, iter);
+ if (n != copy) {
+ md->sg_start = i;
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return -EFAULT;
+ }
+
+ copied += copy;
+ sg->offset += copy;
+ sg->length -= copy;
+ sk_mem_uncharge(sk, copy);
+
+ if (!sg->length) {
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ if (!md->skb)
+ put_page(page);
+ }
+ if (copied == len)
+ break;
+ } while (i != md->sg_end);
+ md->sg_start = i;
+
+ if (!sg->length && md->sg_start == md->sg_end) {
+ list_del(&md->list);
+ if (md->skb)
+ consume_skb(md->skb);
+ kfree(md);
+ }
+ }
+
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied;
+out:
+ rcu_read_unlock();
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+}
+
+
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+ struct sk_msg_buff md = {0};
+ unsigned int sg_copy = 0;
+ struct smap_psock *psock;
+ int copied = 0, err = 0;
+ struct scatterlist *sg;
+ long timeo;
+
+ /* Its possible a sock event or user removed the psock _but_ the ops
+ * have not been reprogrammed yet so we get here. In this case fallback
+ * to tcp_sendmsg. Note this only works because we _only_ ever allow
+ * a single ULP there is no hierarchy here.
+ */
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return tcp_sendmsg(sk, msg, size);
+ }
+
+ /* Increment the psock refcnt to ensure its not released while sending a
+ * message. Required because sk lookup and bpf programs are used in
+ * separate rcu critical sections. Its OK if we lose the map entry
+ * but we can't lose the sock reference.
+ */
+ if (!refcount_inc_not_zero(&psock->refcnt)) {
+ rcu_read_unlock();
+ return tcp_sendmsg(sk, msg, size);
+ }
+
+ sg = md.sg_data;
+ sg_init_marker(sg, MAX_SKB_FRAGS);
+ rcu_read_unlock();
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ while (msg_data_left(msg)) {
+ struct sk_msg_buff *m;
+ bool enospc = false;
+ int copy;
+
+ if (sk->sk_err) {
+ err = sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ m = psock->cork_bytes ? psock->cork : &md;
+ m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
+ err = sk_alloc_sg(sk, copy, m->sg_data,
+ m->sg_start, &m->sg_end, &sg_copy,
+ m->sg_end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = sg_copy;
+ }
+
+ err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
+ if (err < 0) {
+ free_curr_sg(sk, m);
+ goto out_err;
+ }
+
+ psock->sg_size += copy;
+ copied += copy;
+ sg_copy = 0;
+
+ /* When bytes are being corked skip running BPF program and
+ * applying verdict unless there is no more buffer space. In
+ * the ENOSPC case simply run BPF prorgram with currently
+ * accumulated data. We don't have much choice at this point
+ * we could try extending the page frags or chaining complex
+ * frags but even in these cases _eventually_ we will hit an
+ * OOM scenario. More complex recovery schemes may be
+ * implemented in the future, but BPF programs must handle
+ * the case where apply_cork requests are not honored. The
+ * canonical method to verify this is to check data length.
+ */
+ if (psock->cork_bytes) {
+ if (copy > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= copy;
+
+ if (psock->cork_bytes && !enospc)
+ goto out_cork;
+
+ /* All cork bytes accounted for re-run filter */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_err;
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+out_cork:
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied ? copied : err;
+}
+
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sk_msg_buff md = {0}, *m = NULL;
+ int err = 0, copied = 0;
+ struct smap_psock *psock;
+ struct scatterlist *sg;
+ bool enospc = false;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto accept;
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ goto accept;
+ rcu_read_unlock();
+
+ lock_sock(sk);
+
+ if (psock->cork_bytes) {
+ m = psock->cork;
+ sg = &m->sg_data[m->sg_end];
+ } else {
+ m = &md;
+ sg = m->sg_data;
+ sg_init_marker(sg, MAX_SKB_FRAGS);
+ }
+
+ /* Catch case where ring is full and sendpage is stalled. */
+ if (unlikely(m->sg_end == m->sg_start &&
+ m->sg_data[m->sg_end].length))
+ goto out_err;
+
+ psock->sg_size += size;
+ sg_set_page(sg, page, size, offset);
+ get_page(page);
+ m->sg_copy[m->sg_end] = true;
+ sk_mem_charge(sk, size);
+ m->sg_end++;
+ copied = size;
+
+ if (m->sg_end == MAX_SKB_FRAGS)
+ m->sg_end = 0;
+
+ if (m->sg_end == m->sg_start)
+ enospc = true;
+
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+
+ /* All cork bytes accounted for re-run filter */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+out_err:
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied ? copied : err;
+accept:
+ rcu_read_unlock();
+ return tcp_sendpage(sk, page, offset, size, flags);
+}
+
+static void bpf_tcp_msg_add(struct smap_psock *psock,
+ struct sock *sk,
+ struct bpf_prog *tx_msg)
+{
+ struct bpf_prog *orig_tx_msg;
+
+ orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
+ if (orig_tx_msg)
+ bpf_prog_put(orig_tx_msg);
+}
+
static int bpf_tcp_ulp_register(void)
{
tcp_bpf_proto = tcp_prot;
tcp_bpf_proto.close = bpf_tcp_close;
+ /* Once BPF TX ULP is registered it is never unregistered. It
+ * will be in the ULP list for the lifetime of the system. Doing
+ * duplicate registers is not a problem.
+ */
return tcp_register_ulp(&bpf_tcp_ulp_ops);
}
@@ -220,27 +1059,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
__SK_DROP;
}
+static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sock;
+ int copied = 0, num_sg;
+ struct sk_msg_buff *r;
+
+ r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
+ if (unlikely(!r))
+ return -EAGAIN;
+
+ if (!sk_rmem_schedule(sk, skb, skb->len)) {
+ kfree(r);
+ return -EAGAIN;
+ }
+
+ sg_init_table(r->sg_data, MAX_SKB_FRAGS);
+ num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
+ if (unlikely(num_sg < 0)) {
+ kfree(r);
+ return num_sg;
+ }
+ sk_mem_charge(sk, skb->len);
+ copied = skb->len;
+ r->sg_start = 0;
+ r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
+ r->skb = skb;
+ list_add_tail(&r->list, &psock->ingress);
+ sk->sk_data_ready(sk);
+ return copied;
+}
+
static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
+ struct smap_psock *peer;
struct sock *sk;
+ __u32 in;
int rc;
rc = smap_verdict_func(psock, skb);
switch (rc) {
case __SK_REDIRECT:
sk = do_sk_redirect_map(skb);
- if (likely(sk)) {
- struct smap_psock *peer = smap_psock_sk(sk);
-
- if (likely(peer &&
- test_bit(SMAP_TX_RUNNING, &peer->state) &&
- !sock_flag(sk, SOCK_DEAD) &&
- sock_writeable(sk))) {
- skb_set_owner_w(skb, sk);
- skb_queue_tail(&peer->rxqueue, skb);
- schedule_work(&peer->tx_work);
- break;
- }
+ if (!sk) {
+ kfree_skb(skb);
+ break;
+ }
+
+ peer = smap_psock_sk(sk);
+ in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
+
+ if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
+ !test_bit(SMAP_TX_RUNNING, &peer->state))) {
+ kfree_skb(skb);
+ break;
+ }
+
+ if (!in && sock_writeable(sk)) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
+ } else if (in &&
+ atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
}
/* Fall through and free skb otherwise */
case __SK_DROP:
@@ -302,15 +1186,23 @@ static void smap_tx_work(struct work_struct *w)
}
while ((skb = skb_dequeue(&psock->rxqueue))) {
+ __u32 flags;
+
rem = skb->len;
off = 0;
start:
+ flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
do {
- if (likely(psock->sock->sk_socket))
- n = skb_send_sock_locked(psock->sock,
- skb, off, rem);
- else
+ if (likely(psock->sock->sk_socket)) {
+ if (flags)
+ n = smap_do_ingress(psock, skb);
+ else
+ n = skb_send_sock_locked(psock->sock,
+ skb, off, rem);
+ } else {
n = -EINVAL;
+ }
+
if (n <= 0) {
if (n == -EAGAIN) {
/* Retry when space is available */
@@ -328,7 +1220,9 @@ start:
rem -= n;
off += n;
} while (rem);
- kfree_skb(skb);
+
+ if (!flags)
+ kfree_skb(skb);
}
out:
release_sock(psock->sock);
@@ -373,15 +1267,13 @@ static void smap_destroy_psock(struct rcu_head *rcu)
static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
- psock->refcnt--;
- if (psock->refcnt)
- return;
-
- tcp_cleanup_ulp(sock);
- smap_stop_sock(psock, sock);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- rcu_assign_sk_user_data(sock, NULL);
- call_rcu_sched(&psock->rcu, smap_destroy_psock);
+ if (refcount_dec_and_test(&psock->refcnt)) {
+ tcp_cleanup_ulp(sock);
+ smap_stop_sock(psock, sock);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ rcu_assign_sk_user_data(sock, NULL);
+ call_rcu_sched(&psock->rcu, smap_destroy_psock);
+ }
}
static int smap_parse_func_strparser(struct strparser *strp,
@@ -415,7 +1307,6 @@ static int smap_parse_func_strparser(struct strparser *strp,
return rc;
}
-
static int smap_read_sock_done(struct strparser *strp, int err)
{
return err;
@@ -469,6 +1360,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab)
static void smap_gc_work(struct work_struct *w)
{
struct smap_psock_map_entry *e, *tmp;
+ struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
psock = container_of(w, struct smap_psock, gc_work);
@@ -485,12 +1377,28 @@ static void smap_gc_work(struct work_struct *w)
bpf_prog_put(psock->bpf_parse);
if (psock->bpf_verdict)
bpf_prog_put(psock->bpf_verdict);
+ if (psock->bpf_tx_msg)
+ bpf_prog_put(psock->bpf_tx_msg);
+
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ }
+
+ list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
+ list_del(&md->list);
+ free_start_sg(psock->sock, md);
+ kfree(md);
+ }
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
list_del(&e->list);
kfree(e);
}
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+
sock_put(psock->sock);
kfree(psock);
}
@@ -506,12 +1414,14 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
if (!psock)
return ERR_PTR(-ENOMEM);
+ psock->eval = __SK_NONE;
psock->sock = sock;
skb_queue_head_init(&psock->rxqueue);
INIT_WORK(&psock->tx_work, smap_tx_work);
INIT_WORK(&psock->gc_work, smap_gc_work);
INIT_LIST_HEAD(&psock->maps);
- psock->refcnt = 1;
+ INIT_LIST_HEAD(&psock->ingress);
+ refcount_set(&psock->refcnt, 1);
rcu_assign_sk_user_data(sock, psock);
sock_hold(sock);
@@ -714,10 +1624,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct smap_psock_map_entry *e = NULL;
- struct bpf_prog *verdict, *parse;
+ struct bpf_prog *verdict, *parse, *tx_msg;
struct sock *osock, *sock;
struct smap_psock *psock;
u32 i = *(u32 *)key;
+ bool new = false;
int err;
if (unlikely(flags > BPF_EXIST))
@@ -740,6 +1651,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
*/
verdict = READ_ONCE(stab->bpf_verdict);
parse = READ_ONCE(stab->bpf_parse);
+ tx_msg = READ_ONCE(stab->bpf_tx_msg);
if (parse && verdict) {
/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -758,6 +1670,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
}
+ if (tx_msg) {
+ tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+ if (IS_ERR(tx_msg)) {
+ if (verdict)
+ bpf_prog_put(verdict);
+ if (parse)
+ bpf_prog_put(parse);
+ return PTR_ERR(tx_msg);
+ }
+ }
+
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
@@ -772,7 +1695,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
err = -EBUSY;
goto out_progs;
}
- psock->refcnt++;
+ if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
+ err = -EBUSY;
+ goto out_progs;
+ }
+ if (!refcount_inc_not_zero(&psock->refcnt)) {
+ err = -EAGAIN;
+ goto out_progs;
+ }
} else {
psock = smap_init_psock(sock, stab);
if (IS_ERR(psock)) {
@@ -780,11 +1710,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto out_progs;
}
- err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
- if (err)
- goto out_progs;
-
set_bit(SMAP_TX_RUNNING, &psock->state);
+ new = true;
}
e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
@@ -797,6 +1724,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
/* 3. At this point we have a reference to a valid psock that is
* running. Attach any BPF programs needed.
*/
+ if (tx_msg)
+ bpf_tcp_msg_add(psock, sock, tx_msg);
+ if (new) {
+ err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+ if (err)
+ goto out_free;
+ }
+
if (parse && verdict && !psock->strp_enabled) {
err = smap_init_sock(psock, sock);
if (err)
@@ -818,8 +1753,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
struct smap_psock *opsock = smap_psock_sk(osock);
write_lock_bh(&osock->sk_callback_lock);
- if (osock != sock && parse)
- smap_stop_sock(opsock, osock);
smap_list_remove(opsock, &stab->sock_map[i]);
smap_release_sock(opsock, osock);
write_unlock_bh(&osock->sk_callback_lock);
@@ -832,6 +1765,8 @@ out_progs:
bpf_prog_put(verdict);
if (parse)
bpf_prog_put(parse);
+ if (tx_msg)
+ bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
kfree(e);
return err;
@@ -846,6 +1781,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
return -EINVAL;
switch (type) {
+ case BPF_SK_MSG_VERDICT:
+ orig = xchg(&stab->bpf_tx_msg, prog);
+ break;
case BPF_SK_SKB_STREAM_PARSER:
orig = xchg(&stab->bpf_parse, prog);
break;
@@ -907,6 +1845,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file)
orig = xchg(&stab->bpf_verdict, NULL);
if (orig)
bpf_prog_put(orig);
+
+ orig = xchg(&stab->bpf_tx_msg, NULL);
+ if (orig)
+ bpf_prog_put(orig);
}
const struct bpf_map_ops sock_map_ops = {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..57eeb1234b67 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
#include "percpu_freelist.h"
-#define STACK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_STACK_BUILD_ID)
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
u32 nr;
- u64 ip[];
+ u64 data[];
};
struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+ return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+ return stack_map_use_build_id(map) ?
+ sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- value_size < 8 || value_size % 8 ||
- value_size / 8 > sysctl_perf_event_max_stack)
+ value_size < 8 || value_size % 8)
+ return ERR_PTR(-EINVAL);
+
+ BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+ if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+ if (value_size % sizeof(struct bpf_stack_build_id) ||
+ value_size / sizeof(struct bpf_stack_build_id)
+ > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+ } else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
return ERR_PTR(err);
}
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+ unsigned char *build_id,
+ void *note_start,
+ Elf32_Word note_size)
+{
+ Elf32_Word note_offs = 0, new_offs;
+
+ /* check for overflow */
+ if (note_start < page_addr || note_start + note_size < note_start)
+ return -EINVAL;
+
+ /* only supports note that fits in the first page */
+ if (note_start + note_size > page_addr + PAGE_SIZE)
+ return -EINVAL;
+
+ while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+ Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+ if (nhdr->n_type == BPF_BUILD_ID &&
+ nhdr->n_namesz == sizeof("GNU") &&
+ nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+ memcpy(build_id,
+ note_start + note_offs +
+ ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+ BPF_BUILD_ID_SIZE);
+ return 0;
+ }
+ new_offs = note_offs + sizeof(Elf32_Nhdr) +
+ ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+ if (new_offs <= note_offs) /* overflow */
+ break;
+ note_offs = new_offs;
+ }
+ return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+ Elf32_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+ Elf64_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr;
+ struct page *page;
+ void *page_addr;
+ int ret;
+
+ /* only works for page backed storage */
+ if (!vma->vm_file)
+ return -EINVAL;
+
+ page = find_get_page(vma->vm_file->f_mapping, 0);
+ if (!page)
+ return -EFAULT; /* page not mapped */
+
+ ret = -EINVAL;
+ page_addr = page_address(page);
+ ehdr = (Elf32_Ehdr *)page_addr;
+
+ /* compare magic x7f "ELF" */
+ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+ goto out;
+
+ /* only support executable file and shared object file */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+ goto out;
+
+ if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+ ret = stack_map_get_build_id_32(page_addr, build_id);
+ else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+ ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+ put_page(page);
+ return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+ struct stack_map_bucket *bucket,
+ u64 *ips, u32 trace_nr, bool user)
+{
+ int i;
+ struct vm_area_struct *vma;
+ struct bpf_stack_build_id *id_offs;
+
+ bucket->nr = trace_nr;
+ id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+ /*
+ * We cannot do up_read() in nmi context, so build_id lookup is
+ * only supported for non-nmi events. If at some point, it is
+ * possible to run find_vma() without taking the semaphore, we
+ * would like to allow build_id lookup in nmi context.
+ *
+ * Same fallback is used for kernel stack (!user) on a stackmap
+ * with build_id.
+ */
+ if (!user || !current || !current->mm || in_nmi() ||
+ down_read_trylock(&current->mm->mmap_sem) == 0) {
+ /* cannot access current->mm, fall back to ips */
+ for (i = 0; i < trace_nr; i++) {
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ }
+ return;
+ }
+
+ for (i = 0; i < trace_nr; i++) {
+ vma = find_vma(current->mm, ips[i]);
+ if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+ /* per entry fall back to ips */
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ continue;
+ }
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+ - vma->vm_start;
+ id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ }
+ up_read(&current->mm->mmap_sem);
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / 8;
+ u32 max_depth = map->value_size / stack_map_data_size(map);
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
bool user = flags & BPF_F_USER_STACK;
bool kernel = !user;
u64 *ips;
+ bool hash_matches;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
- if (bucket && bucket->hash == hash) {
- if (flags & BPF_F_FAST_STACK_CMP)
+ hash_matches = bucket && bucket->hash == hash;
+ /* fast cmp */
+ if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+ return id;
+
+ if (stack_map_use_build_id(map)) {
+ /* for build_id+offset, pop a bucket before slow cmp */
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ stack_map_get_build_id_offset(map, new_bucket, ips,
+ trace_nr, user);
+ trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
return id;
- if (bucket->nr == trace_nr &&
- memcmp(bucket->ip, ips, trace_len) == 0)
+ }
+ if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+ return -EEXIST;
+ }
+ } else {
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, ips, trace_len) == 0)
return id;
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ memcpy(new_bucket->data, ips, trace_len);
}
- /* this call stack is not in the map, try to add it */
- if (bucket && !(flags & BPF_F_REUSE_STACKID))
- return -EEXIST;
-
- new_bucket = (struct stack_map_bucket *)
- pcpu_freelist_pop(&smap->freelist);
- if (unlikely(!new_bucket))
- return -ENOMEM;
-
- memcpy(new_bucket->ip, ips, trace_len);
new_bucket->hash = hash;
new_bucket->nr = trace_nr;
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
if (!bucket)
return -ENOENT;
- trace_len = bucket->nr * sizeof(u64);
- memcpy(value, bucket->ip, trace_len);
+ trace_len = bucket->nr * stack_map_data_size(map);
+ memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
old_bucket = xchg(&smap->buckets[id], bucket);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 43f95d190eea..4ca46df19c9a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -203,11 +203,13 @@ static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
+ idr_preload(GFP_KERNEL);
spin_lock_bh(&map_idr_lock);
id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
map->id = id;
spin_unlock_bh(&map_idr_lock);
+ idr_preload_end();
if (WARN_ON_ONCE(!id))
return -ENOSPC;
@@ -940,11 +942,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
int id;
+ idr_preload(GFP_KERNEL);
spin_lock_bh(&prog_idr_lock);
id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
prog->aux->id = id;
spin_unlock_bh(&prog_idr_lock);
+ idr_preload_end();
/* id is in [1, INT_MAX) */
if (WARN_ON_ONCE(!id))
@@ -1167,8 +1171,63 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
+/* Initially all BPF programs could be loaded w/o specifying
+ * expected_attach_type. Later for some of them specifying expected_attach_type
+ * at load time became required so that program could be validated properly.
+ * Programs of types that are allowed to be loaded both w/ and w/o (for
+ * backward compatibility) expected_attach_type, should have the default attach
+ * type assigned to expected_attach_type for the latter case, so that it can be
+ * validated later at attach time.
+ *
+ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
+ * prog type requires it but has some attach types that have to be backward
+ * compatible.
+ */
+static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
+{
+ switch (attr->prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
+ * exist so checking for non-zero is the way to go here.
+ */
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_CGROUP_INET_SOCK_CREATE;
+ break;
+ }
+}
+
+static int
+bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
+ enum bpf_attach_type expected_attach_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ default:
+ return 0;
+ }
+}
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
+#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1205,11 +1264,17 @@ static int bpf_prog_load(union bpf_attr *attr)
!capable(CAP_SYS_ADMIN))
return -EPERM;
+ bpf_prog_load_fixup_attach_type(attr);
+ if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
+ return -EINVAL;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
+ prog->expected_attach_type = attr->expected_attach_type;
+
prog->aux->offload_requested = !!attr->prog_ifindex;
err = security_bpf_prog_alloc(prog->aux);
@@ -1311,11 +1376,99 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
+struct bpf_raw_tracepoint {
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+};
+
+static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_raw_tracepoint *raw_tp = filp->private_data;
+
+ if (raw_tp->prog) {
+ bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
+ bpf_prog_put(raw_tp->prog);
+ }
+ kfree(raw_tp);
+ return 0;
+}
+
+static const struct file_operations bpf_raw_tp_fops = {
+ .release = bpf_raw_tracepoint_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+};
+
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+ struct bpf_raw_tracepoint *raw_tp;
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+ char tp_name[128];
+ int tp_fd, err;
+
+ if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
+ sizeof(tp_name) - 1) < 0)
+ return -EFAULT;
+ tp_name[sizeof(tp_name) - 1] = 0;
+
+ btp = bpf_find_raw_tracepoint(tp_name);
+ if (!btp)
+ return -ENOENT;
+
+ raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
+ if (!raw_tp)
+ return -ENOMEM;
+ raw_tp->btp = btp;
+
+ prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
+ BPF_PROG_TYPE_RAW_TRACEPOINT);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto out_free_tp;
+ }
+
+ err = bpf_probe_register(raw_tp->btp, prog);
+ if (err)
+ goto out_put_prog;
+
+ raw_tp->prog = prog;
+ tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
+ O_CLOEXEC);
+ if (tp_fd < 0) {
+ bpf_probe_unregister(raw_tp->btp, prog);
+ err = tp_fd;
+ goto out_put_prog;
+ }
+ return tp_fd;
+
+out_put_prog:
+ bpf_prog_put(prog);
+out_free_tp:
+ kfree(raw_tp);
+ return err;
+}
+
#ifdef CONFIG_CGROUP_BPF
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ default:
+ return 0;
+ }
+}
+
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
-static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
+static int sockmap_get_from_fd(const union bpf_attr *attr,
+ int type, bool attach)
{
struct bpf_prog *prog = NULL;
int ufd = attr->target_fd;
@@ -1329,8 +1482,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return PTR_ERR(map);
if (attach) {
- prog = bpf_prog_get_type(attr->attach_bpf_fd,
- BPF_PROG_TYPE_SK_SKB);
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
if (IS_ERR(prog)) {
fdput(f);
return PTR_ERR(prog);
@@ -1374,17 +1526,27 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ break;
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
case BPF_CGROUP_DEVICE:
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
+ case BPF_SK_MSG_VERDICT:
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, true);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
default:
return -EINVAL;
}
@@ -1393,6 +1555,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
+ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp)) {
bpf_prog_put(prog);
@@ -1429,17 +1596,27 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ break;
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
case BPF_CGROUP_DEVICE:
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
+ case BPF_SK_MSG_VERDICT:
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
default:
return -EINVAL;
}
@@ -1478,6 +1655,12 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
break;
@@ -1917,6 +2100,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET_INFO_BY_FD:
err = bpf_obj_get_info_by_fd(&attr, uattr);
break;
+ case BPF_RAW_TRACEPOINT_OPEN:
+ err = bpf_raw_tracepoint_open(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c6eff108aa99..5dd1dcb902bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -168,23 +168,12 @@ struct bpf_call_arg_meta {
static DEFINE_MUTEX(bpf_verifier_lock);
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
{
- struct bpf_verifer_log *log = &env->log;
unsigned int n;
- va_list args;
- if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
- return;
-
- va_start(args, fmt);
n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
- va_end(args);
WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
"verifier log line truncated - local buffer too short\n");
@@ -197,14 +186,37 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
else
log->ubuf = NULL;
}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-/* Historically bpf_verifier_log_write was called verbose, but the name was too
- * generic for symbol export. The function was renamed, but not the calls in
- * the verifier to avoid complicating backports. Hence the alias below.
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
*/
-static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
- const char *fmt, ...)
- __attribute__((alias("bpf_verifier_log_write")));
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
+{
+ struct bpf_verifier_env *env = private_data;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
@@ -508,10 +520,6 @@ err:
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-#define CALLEE_SAVED_REGS 5
-static const int callee_saved[CALLEE_SAVED_REGS] = {
- BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
-};
static void __mark_reg_not_init(struct bpf_reg_state *reg);
@@ -1252,6 +1260,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
if (meta)
return meta->pkt_access;
@@ -1314,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
};
if (env->ops->is_valid_access &&
- env->ops->is_valid_access(off, size, t, &info)) {
+ env->ops->is_valid_access(off, size, t, env->prog, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -2075,7 +2084,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKMAP:
if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
- func_id != BPF_FUNC_map_delete_elem)
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_msg_redirect_map)
goto error;
break;
default:
@@ -2113,6 +2123,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_FUNC_sk_redirect_map:
+ case BPF_FUNC_msg_redirect_map:
if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
goto error;
break;
@@ -2338,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id);
+ fn = env->ops->get_func_proto(func_id, env->prog);
if (!fn) {
verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
func_id);
@@ -3876,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
break;
@@ -4601,10 +4613,11 @@ static int do_check(struct bpf_verifier_env *env)
if (env->log.level) {
const struct bpf_insn_cbs cbs = {
.cb_print = verbose,
+ .private_data = env,
};
verbose(env, "%d: ", insn_idx);
- print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
@@ -5560,7 +5573,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = env->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
@@ -5602,7 +5615,7 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
struct bpf_verifier_env *env;
- struct bpf_verifer_log *log;
+ struct bpf_verifier_log *log;
int ret = -EINVAL;
/* no program is valid */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4bfb2908ec15..a662bfcbea0e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4524,10 +4524,10 @@ static struct cftype cgroup_base_files[] = {
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
-static void css_free_work_fn(struct work_struct *work)
+static void css_free_rwork_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+ struct cgroup_subsys_state, destroy_rwork);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
@@ -4573,15 +4573,6 @@ static void css_free_work_fn(struct work_struct *work)
}
}
-static void css_free_rcu_fn(struct rcu_head *rcu_head)
-{
- struct cgroup_subsys_state *css =
- container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
-
- INIT_WORK(&css->destroy_work, css_free_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
-}
-
static void css_release_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
@@ -4631,7 +4622,8 @@ static void css_release_work_fn(struct work_struct *work)
mutex_unlock(&cgroup_mutex);
- call_rcu(&css->rcu_head, css_free_rcu_fn);
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -4765,7 +4757,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- call_rcu(&css->rcu_head, css_free_rcu_fn);
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4f63597c824d..f7674d676889 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa(vmcoreinfo_note);
}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
static int __init crash_save_vmcoreinfo_init(void)
{
@@ -453,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_swapbacked);
VMCOREINFO_NUMBER(PG_slab);
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 90ff129c88a2..62c301ad0773 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i)
kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
if (bp->bp_enabled)
- kdb_printf("\n is enabled");
+ kdb_printf("\n is enabled ");
else
kdb_printf("\n is disabled");
- kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n",
bp->bp_addr, bp->bp_type, bp->bp_installed);
kdb_printf("\n");
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index dbb0781a0533..e405677ee08d 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p)
kdb_current_regs = NULL;
}
+static void drop_newline(char *buf)
+{
+ size_t len = strlen(buf);
+
+ if (len == 0)
+ return;
+ if (*(buf + len - 1) == '\n')
+ *(buf + len - 1) = '\0';
+}
+
/*
* kdb_local - The main code for kdb. This routine is invoked on a
* specific processor, it is not global. The main kdb() routine
@@ -1327,6 +1337,7 @@ do_full_getstr:
cmdptr = cmd_head;
diag = kdb_parse(cmdbuf);
if (diag == KDB_NOTFOUND) {
+ drop_newline(cmdbuf);
kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
diag = 0;
}
@@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv)
int symbolic = 0;
int valid = 0;
int phys = 0;
+ int raw = 0;
kdbgetintenv("MDCOUNT", &mdcount);
kdbgetintenv("RADIX", &radix);
@@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv)
repeat = mdcount * 16 / bytesperword;
if (strcmp(argv[0], "mdr") == 0) {
- if (argc != 2)
+ if (argc == 2 || (argc == 0 && last_addr != 0))
+ valid = raw = 1;
+ else
return KDB_ARGCOUNT;
- valid = 1;
} else if (isdigit(argv[0][2])) {
bytesperword = (int)(argv[0][2] - '0');
if (bytesperword == 0) {
@@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv)
radix = last_radix;
bytesperword = last_bytesperword;
repeat = last_repeat;
- mdcount = ((repeat * bytesperword) + 15) / 16;
+ if (raw)
+ mdcount = repeat;
+ else
+ mdcount = ((repeat * bytesperword) + 15) / 16;
}
if (argc) {
@@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv)
diag = kdbgetularg(argv[nextarg], &val);
if (!diag) {
mdcount = (int) val;
- repeat = mdcount * 16 / bytesperword;
+ if (raw)
+ repeat = mdcount;
+ else
+ repeat = mdcount * 16 / bytesperword;
}
}
if (argc >= nextarg+1) {
@@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv)
}
}
- if (strcmp(argv[0], "mdr") == 0)
- return kdb_mdr(addr, mdcount);
+ if (strcmp(argv[0], "mdr") == 0) {
+ int ret;
+ last_addr = addr;
+ ret = kdb_mdr(addr, mdcount);
+ last_addr += mdcount;
+ last_repeat = mdcount;
+ last_bytesperword = bytesperword; // to make REPEAT happy
+ return ret;
+ }
switch (radix) {
case 10:
@@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv)
return 0;
}
-struct kdb_tm {
- int tm_sec; /* seconds */
- int tm_min; /* minutes */
- int tm_hour; /* hours */
- int tm_mday; /* day of the month */
- int tm_mon; /* month */
- int tm_year; /* year */
-};
-
-static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
-{
- /* This will work from 1970-2099, 2100 is not a leap year */
- static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
- 31, 30, 31, 30, 31 };
- memset(tm, 0, sizeof(*tm));
- tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
- tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
- (2 * 365 + 1); /* shift base from 1970 to 1968 */
- tm->tm_min = tm->tm_sec / 60 % 60;
- tm->tm_hour = tm->tm_sec / 60 / 60;
- tm->tm_sec = tm->tm_sec % 60;
- tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
- tm->tm_mday %= (4*365+1);
- mon_day[1] = 29;
- while (tm->tm_mday >= mon_day[tm->tm_mon]) {
- tm->tm_mday -= mon_day[tm->tm_mon];
- if (++tm->tm_mon == 12) {
- tm->tm_mon = 0;
- ++tm->tm_year;
- mon_day[1] = 28;
- }
- }
- ++tm->tm_mday;
-}
-
/*
* Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
* I cannot call that code directly from kdb, it has an unconditional
@@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
*/
static void kdb_sysinfo(struct sysinfo *val)
{
- struct timespec uptime;
- ktime_get_ts(&uptime);
+ u64 uptime = ktime_get_mono_fast_ns();
+
memset(val, 0, sizeof(*val));
- val->uptime = uptime.tv_sec;
+ val->uptime = div_u64(uptime, NSEC_PER_SEC);
val->loads[0] = avenrun[0];
val->loads[1] = avenrun[1];
val->loads[2] = avenrun[2];
@@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val)
*/
static int kdb_summary(int argc, const char **argv)
{
- struct timespec now;
- struct kdb_tm tm;
+ time64_t now;
+ struct tm tm;
struct sysinfo val;
if (argc)
@@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
kdb_printf("ccversion %s\n", __stringify(CCVERSION));
- now = __current_kernel_time();
- kdb_gmtime(&now, &tm);
- kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ now = __ktime_get_real_seconds();
+ time64_to_tm(now, 0, &tm);
+ kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d "
"tz_minuteswest %d\n",
1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
tm.tm_hour, tm.tm_min, tm.tm_sec,
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4cc..990b3cc526c8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
}
if (i >= ARRAY_SIZE(kdb_name_table)) {
debug_kfree(kdb_name_table[0]);
- memcpy(kdb_name_table, kdb_name_table+1,
+ memmove(kdb_name_table, kdb_name_table+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-1));
} else {
debug_kfree(knt1);
knt1 = kdb_name_table[i];
- memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ memmove(kdb_name_table+i, kdb_name_table+i+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-i-1));
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc1c330c6bd6..2d5fe26551f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
@@ -8397,6 +8400,10 @@ static int perf_kprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_kprobe.type)
return -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
/*
* no branch sampling for probe events
*/
@@ -8434,6 +8441,10 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
/*
* no branch sampling for probe events
*/
@@ -9955,6 +9966,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
+ get_task_struct(task);
event->hw.target = task;
}
@@ -10070,6 +10082,8 @@ err_ns:
perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
kfree(event);
return ERR_PTR(err);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0975b0268545..a5697119290e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -19,7 +19,6 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
-#include <linux/fs_struct.h>
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
diff --git a/kernel/fork.c b/kernel/fork.c
index f71b67dc156d..242c8c93d285 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -595,6 +595,8 @@ static void check_mm(struct mm_struct *mm)
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
+ WARN_ON_ONCE(mm == current->mm);
+ WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
hmm_mm_destroy(mm);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 6fc87ccda1d7..c6766f326072 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -132,3 +132,9 @@ config GENERIC_IRQ_DEBUGFS
If you don't know what to do here, say N.
endmenu
+
+config GENERIC_IRQ_MULTI_HANDLER
+ depends on !MULTI_IRQ_HANDLER
+ bool
+ help
+ Allow to specify the low level IRQ handler at run time.
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a37a3b4b6342..f4f29b9d90ee 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
}
}
-static cpumask_var_t *alloc_node_to_possible_cpumask(void)
+static cpumask_var_t *alloc_node_to_cpumask(void)
{
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ out_unwind:
return NULL;
}
-static void free_node_to_possible_cpumask(cpumask_var_t *masks)
+static void free_node_to_cpumask(cpumask_var_t *masks)
{
int node;
@@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks)
kfree(masks);
}
-static void build_node_to_possible_cpumask(cpumask_var_t *masks)
+static void build_node_to_cpumask(cpumask_var_t *masks)
{
int cpu;
@@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
- if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
+ if (cpumask_intersects(mask, node_to_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -94,73 +94,46 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
return nodes;
}
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs: The total number of vectors
- * @affd: Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
- */
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+ int startvec, int numvecs,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk,
+ struct cpumask *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, curvec;
- int affv = nvecs - affd->pre_vectors - affd->post_vectors;
- int last_affv = affv + affd->pre_vectors;
+ int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ int last_affv = affd->pre_vectors + numvecs;
+ int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
- struct cpumask *masks;
- cpumask_var_t nmsk, *node_to_possible_cpumask;
-
- /*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
- */
- if (!affv)
- return NULL;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return NULL;
-
- masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
- if (!masks)
- goto out;
- node_to_possible_cpumask = alloc_node_to_possible_cpumask();
- if (!node_to_possible_cpumask)
- goto out;
+ if (!cpumask_weight(cpu_mask))
+ return 0;
- /* Fill out vectors at the beginning that don't need affinity */
- for (curvec = 0; curvec < affd->pre_vectors; curvec++)
- cpumask_copy(masks + curvec, irq_default_affinity);
-
- /* Stabilize the cpumasks */
- get_online_cpus();
- build_node_to_possible_cpumask(node_to_possible_cpumask);
- nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
- &nodemsk);
+ nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
/*
* If the number of nodes in the mask is greater than or equal the
* number of vectors we just spread the vectors across the nodes.
*/
- if (affv <= nodes) {
+ if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_copy(masks + curvec,
- node_to_possible_cpumask[n]);
- if (++curvec == last_affv)
+ cpumask_copy(masks + curvec, node_to_cpumask[n]);
+ if (++done == numvecs)
break;
+ if (++curvec == last_affv)
+ curvec = affd->pre_vectors;
}
- goto done;
+ goto out;
}
for_each_node_mask(n, nodemsk) {
int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
- vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
+ vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -181,19 +154,96 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
- if (curvec >= last_affv)
+ done += v;
+ if (done >= numvecs)
break;
+ if (curvec >= last_affv)
+ curvec = affd->pre_vectors;
--nodes;
}
-done:
+out:
+ return done;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd: Description of the affinity requirements
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+{
+ int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ int curvec, usedvecs;
+ cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
+ struct cpumask *masks = NULL;
+
+ /*
+ * If there aren't any vectors left after applying the pre/post
+ * vectors don't bother with assigning affinity.
+ */
+ if (nvecs == affd->pre_vectors + affd->post_vectors)
+ return NULL;
+
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+ return NULL;
+
+ if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+ goto outcpumsk;
+
+ node_to_cpumask = alloc_node_to_cpumask();
+ if (!node_to_cpumask)
+ goto outnpresmsk;
+
+ masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto outnodemsk;
+
+ /* Fill out vectors at the beginning that don't need affinity */
+ for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+ cpumask_copy(masks + curvec, irq_default_affinity);
+
+ /* Stabilize the cpumasks */
+ get_online_cpus();
+ build_node_to_cpumask(node_to_cpumask);
+
+ /* Spread on present CPUs starting from affd->pre_vectors */
+ usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
+ node_to_cpumask, cpu_present_mask,
+ nmsk, masks);
+
+ /*
+ * Spread on non present CPUs starting from the next vector to be
+ * handled. If the spreading of present CPUs already exhausted the
+ * vector space, assign the non present CPUs to the already spread
+ * out vectors.
+ */
+ if (usedvecs >= affvecs)
+ curvec = affd->pre_vectors;
+ else
+ curvec = affd->pre_vectors + usedvecs;
+ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+ usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
+ node_to_cpumask, npresmsk,
+ nmsk, masks);
put_online_cpus();
/* Fill out vectors at the end that don't need affinity */
+ if (usedvecs >= affvecs)
+ curvec = affd->pre_vectors + affvecs;
+ else
+ curvec = affd->pre_vectors + usedvecs;
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
- free_node_to_possible_cpumask(node_to_possible_cpumask);
-out:
+
+outnodemsk:
+ free_node_to_cpumask(node_to_cpumask);
+outnpresmsk:
+ free_cpumask_var(npresmsk);
+outcpumsk:
free_cpumask_var(nmsk);
return masks;
}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 8c82ea26e837..16cbf6beb276 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/autoprobe.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c69357a43849..a2b3d9de999c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1,13 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/chip.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code, for irq-chip
- * based architectures.
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the core interrupt handling code, for irq-chip based
+ * architectures. Detailed information is available in
+ * Documentation/core-api/genericirq.rst
*/
#include <linux/irq.h>
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 9eb09aef0313..5b1072e394b2 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic cpu hotunplug interrupt migration code copied from the
* arch/arm implementation
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index acfaaef8672a..4dadeb3d6666 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -1,8 +1,6 @@
-/*
- * Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
- *
- * This file is licensed under the GPL V2.
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
+
#include <linux/irqdomain.h>
#include <linux/irq.h>
#include <linux/uaccess.h>
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 194c506d9d20..6a682c229e10 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/device.h>
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 326a67f2410b..0b0cdf206dc4 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 508c03dfef25..e2999a070a99 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Library implementing the most common irq chip callback functions
*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 79f987b942b8..38554bc35375 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/handle.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code.
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the core interrupt handling code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
@@ -20,6 +18,10 @@
#include "internals.h"
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+void (*handle_arch_irq)(struct pt_regs *) __ro_after_init;
+#endif
+
/**
* handle_bad_irq - handle spurious and unhandled irqs
* @desc: description of the interrupt
@@ -207,3 +209,14 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}
+
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
+{
+ if (handle_arch_irq)
+ return -EBUSY;
+
+ handle_arch_irq = handle_irq;
+ return 0;
+}
+#endif
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 259a22aa9934..8b778e37dc6d 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/ipi.c
- *
* Copyright (C) 2015 Imagination Technologies Ltd
* Author: Qais Yousef <qais.yousef@imgtec.com>
*
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 24caabf1a0f7..fc4f361a86bb 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
*
@@ -7,6 +8,7 @@
* option) any later version.
*/
+#include <linux/slab.h>
#include <linux/irq_sim.h>
#include <linux/irq.h>
@@ -49,7 +51,8 @@ static void irq_sim_handle_irq(struct irq_work *work)
* @sim: The interrupt simulator object to initialize.
* @num_irqs: Number of interrupts to allocate
*
- * Returns 0 on success and a negative error number on failure.
+ * On success: return the base of the allocated interrupt range.
+ * On failure: a negative errno.
*/
int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
{
@@ -78,7 +81,7 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
sim->irq_count = num_irqs;
- return 0;
+ return sim->irq_base;
}
EXPORT_SYMBOL_GPL(irq_sim_init);
@@ -110,7 +113,8 @@ static void devm_irq_sim_release(struct device *dev, void *res)
* @sim: The interrupt simulator object to initialize.
* @num_irqs: Number of interrupts to allocate
*
- * Returns 0 on success and a negative error number on failure.
+ * On success: return the base of the allocated interrupt range.
+ * On failure: a negative errno.
*/
int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
unsigned int num_irqs)
@@ -123,7 +127,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
return -ENOMEM;
rv = irq_sim_init(sim, num_irqs);
- if (rv) {
+ if (rv < 0) {
devres_free(dr);
return rv;
}
@@ -131,7 +135,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
dr->sim = sim;
devres_add(dev, dr);
- return 0;
+ return rv;
}
EXPORT_SYMBOL_GPL(devm_irq_sim_init);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 49b54e9979cc..afc7f902d74a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the interrupt descriptor management code
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the interrupt descriptor management code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
#include <linux/irq.h>
@@ -210,6 +210,22 @@ static ssize_t type_show(struct kobject *kobj,
}
IRQ_ATTR_RO(type);
+static ssize_t wakeup_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ ret = sprintf(buf, "%s\n",
+ irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled");
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+
+}
+IRQ_ATTR_RO(wakeup);
+
static ssize_t name_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -253,6 +269,7 @@ static struct attribute *irq_attrs[] = {
&chip_name_attr.attr,
&hwirq_attr.attr,
&type_attr.attr,
+ &wakeup_attr.attr,
&name_attr.attr,
&actions_attr.attr,
NULL
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 82b8b18ee1eb..5d9fc01b60a6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
#define pr_fmt(fmt) "irq: " fmt
#include <linux/acpi.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0f922729bab9..e3336d904f64 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/manage.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006 Thomas Gleixner
*
@@ -855,10 +854,14 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
* This code is triggered unconditionally. Check the affinity
* mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
*/
- if (cpumask_available(desc->irq_common_data.affinity))
- cpumask_copy(mask, desc->irq_common_data.affinity);
- else
+ if (cpumask_available(desc->irq_common_data.affinity)) {
+ const struct cpumask *m;
+
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ } else {
valid = false;
+ }
raw_spin_unlock_irq(&desc->lock);
if (valid)
@@ -1519,9 +1522,9 @@ EXPORT_SYMBOL_GPL(setup_irq);
* Internal function to unregister an irqaction - used to free
* regular and special interrupts that are part of the architecture.
*/
-static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ unsigned irq = desc->irq_data.irq;
struct irqaction *action, **action_ptr;
unsigned long flags;
@@ -1651,7 +1654,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
struct irq_desc *desc = irq_to_desc(irq);
if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- __free_irq(irq, act->dev_id);
+ __free_irq(desc, act->dev_id);
}
EXPORT_SYMBOL_GPL(remove_irq);
@@ -1685,7 +1688,7 @@ const void *free_irq(unsigned int irq, void *dev_id)
desc->affinity_notify = NULL;
#endif
- action = __free_irq(irq, dev_id);
+ action = __free_irq(desc, dev_id);
if (!action)
return NULL;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 4c5770407031..5092494bf261 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -1,8 +1,6 @@
-/*
- * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
- *
- * SPDX-License-Identifier: GPL-2.0
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2f3c4f5382cc..2a8571f72b17 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/msi.c
- *
* Copyright (C) 2014 Intel Corp.
* Author: Jiang Liu <jiang.liu@linux.intel.com>
*
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 6bd9b58429cc..d6961d3c6f9e 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/pm.c
- *
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e8f374971e37..7cb091d81d91 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/proc.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 1d08f45135c2..95414ad3506a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/resend.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner
*
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6cdecc6f4c53..d867d6ddafdd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/spurious.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index e0923fa4927a..1e4cb63a5c82 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,13 +1,6 @@
-/*
- * linux/kernel/irq/timings.c
- *
- * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index e5bcd94c1efb..75d8e7cf040e 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -22,50 +22,123 @@
#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
static int kexec_calculate_store_digests(struct kimage *image);
+/*
+ * Currently this is the only default function that is exported as some
+ * architectures need it to do additional handlings.
+ * In the future, other default functions may be exported too if required.
+ */
+int kexec_image_probe_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ const struct kexec_file_ops * const *fops;
+ int ret = -ENOEXEC;
+
+ for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) {
+ ret = (*fops)->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = *fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -ENOEXEC;
+ return kexec_image_probe_default(image, buf, buf_len);
+}
+
+static void *kexec_image_load_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
}
void * __weak arch_kexec_kernel_image_load(struct kimage *image)
{
- return ERR_PTR(-ENOEXEC);
+ return kexec_image_load_default(image);
+}
+
+static int kexec_image_post_load_cleanup_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
}
int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- return -EINVAL;
+ return kexec_image_post_load_cleanup_default(image);
}
#ifdef CONFIG_KEXEC_VERIFY_SIG
+static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -EKEYREJECTED;
+ return kexec_image_verify_sig_default(image, buf, buf_len);
}
#endif
-/* Apply relocations of type RELA */
+/*
+ * arch_kexec_apply_relocations_add - apply relocations of type RELA
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("RELA relocation unsupported.\n");
return -ENOEXEC;
}
-/* Apply relocations of type REL */
+/*
+ * arch_kexec_apply_relocations - apply relocations of type REL
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("REL relocation unsupported.\n");
return -ENOEXEC;
@@ -532,6 +605,9 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ return 0;
+
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
@@ -633,87 +709,29 @@ out:
return ret;
}
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
+#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+/*
+ * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer to setup.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, bss_align, bss_sz, bss_pad;
- unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
- .buf_min = min, .buf_max = max,
- .top_down = top_down };
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
+ const Elf_Shdr *sechdrs;
+ unsigned long bss_align;
+ unsigned long bss_sz;
+ unsigned long align;
+ int i, ret;
- /* Determine how much memory is needed to load relocatable object. */
- bss_align = 1;
- bss_sz = 0;
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ kbuf->buf_align = bss_align = 1;
+ kbuf->bufsz = bss_sz = 0;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -721,111 +739,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
align = sechdrs[i].sh_addralign;
if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (kbuf.buf_align < align)
- kbuf.buf_align = align;
- kbuf.bufsz = ALIGN(kbuf.bufsz, align);
- kbuf.bufsz += sechdrs[i].sh_size;
+ if (kbuf->buf_align < align)
+ kbuf->buf_align = align;
+ kbuf->bufsz = ALIGN(kbuf->bufsz, align);
+ kbuf->bufsz += sechdrs[i].sh_size;
} else {
- /* bss section */
if (bss_align < align)
bss_align = align;
bss_sz = ALIGN(bss_sz, align);
bss_sz += sechdrs[i].sh_size;
}
}
+ kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align);
+ kbuf->memsz = kbuf->bufsz + bss_sz;
+ if (kbuf->buf_align < bss_align)
+ kbuf->buf_align = bss_align;
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (kbuf.bufsz & (bss_align - 1))
- bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
-
- kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
+ kbuf->buffer = vzalloc(kbuf->bufsz);
+ if (!kbuf->buffer)
+ return -ENOMEM;
+ pi->purgatory_buf = kbuf->buffer;
- /* Allocate buffer for purgatory */
- kbuf.buffer = vzalloc(kbuf.bufsz);
- if (!kbuf.buffer) {
- ret = -ENOMEM;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
goto out;
- }
- if (kbuf.buf_align < bss_align)
- kbuf.buf_align = bss_align;
+ return 0;
+out:
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+ return ret;
+}
- /* Add buffer to segment list */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- goto out;
- pi->purgatory_load_addr = kbuf.mem;
+/*
+ * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer prepared to store purgatory.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
+{
+ unsigned long bss_addr;
+ unsigned long offset;
+ Elf_Shdr *sechdrs;
+ int i;
+
+ /*
+ * The section headers in kexec_purgatory are read-only. In order to
+ * have them modifiable make a temporary copy.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
+ pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ pi->sechdrs = sechdrs;
- /* Load SHF_ALLOC sections */
- buf_addr = kbuf.buffer;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + kbuf.bufsz + bss_pad;
+ offset = 0;
+ bss_addr = kbuf->mem + kbuf->bufsz;
+ kbuf->image->start = pi->ehdr->e_entry;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ unsigned long align;
+ void *src, *dst;
+
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
continue;
align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
+ if (sechdrs[i].sh_type == SHT_NOBITS) {
bss_addr = ALIGN(bss_addr, align);
sechdrs[i].sh_addr = bss_addr;
bss_addr += sechdrs[i].sh_size;
+ continue;
}
- }
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
+ offset = ALIGN(offset, align);
+ if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
+ pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ + sechdrs[i].sh_size)) {
+ kbuf->image->start -= sechdrs[i].sh_addr;
+ kbuf->image->start += kbuf->mem + offset;
+ }
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
+ src = (void *)pi->ehdr + sechdrs[i].sh_offset;
+ dst = pi->purgatory_buf + offset;
+ memcpy(dst, src, sechdrs[i].sh_size);
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
+ sechdrs[i].sh_addr = kbuf->mem + offset;
+ sechdrs[i].sh_offset = offset;
+ offset += sechdrs[i].sh_size;
+ }
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = kbuf.buffer;
- return ret;
-out:
- vfree(sechdrs);
- vfree(kbuf.buffer);
- return ret;
+ return 0;
}
static int kexec_apply_relocations(struct kimage *image)
{
int i, ret;
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
+ const Elf_Shdr *sechdrs;
+
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
- /* Apply relocations */
for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
+ const Elf_Shdr *relsec;
+ const Elf_Shdr *symtab;
+ Elf_Shdr *section;
+
+ relsec = sechdrs + i;
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
+ if (relsec->sh_type != SHT_RELA &&
+ relsec->sh_type != SHT_REL)
continue;
/*
@@ -834,12 +865,12 @@ static int kexec_apply_relocations(struct kimage *image)
* symbol table. And ->sh_info contains section header
* index of section to which relocations apply.
*/
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ if (relsec->sh_info >= pi->ehdr->e_shnum ||
+ relsec->sh_link >= pi->ehdr->e_shnum)
return -ENOEXEC;
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
+ section = pi->sechdrs + relsec->sh_info;
+ symtab = sechdrs + relsec->sh_link;
if (!(section->sh_flags & SHF_ALLOC))
continue;
@@ -856,12 +887,12 @@ static int kexec_apply_relocations(struct kimage *image)
* Respective architecture needs to provide support for applying
* relocations of type SHT_RELA/SHT_REL.
*/
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
+ if (relsec->sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi, section,
+ relsec, symtab);
+ else if (relsec->sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi, section,
+ relsec, symtab);
if (ret)
return ret;
}
@@ -869,10 +900,18 @@ static int kexec_apply_relocations(struct kimage *image)
return 0;
}
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
+/*
+ * kexec_load_purgatory - Load and relocate the purgatory object.
+ * @image: Image to add the purgatory to.
+ * @kbuf: Memory parameters to use.
+ *
+ * Allocates the memory needed for image->purgatory_info.sechdrs and
+ * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible
+ * to free the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf)
{
struct purgatory_info *pi = &image->purgatory_info;
int ret;
@@ -880,55 +919,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
if (kexec_purgatory_size <= 0)
return -EINVAL;
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
+ pi->ehdr = (const Elf_Ehdr *)kexec_purgatory;
- ret = __kexec_load_purgatory(image, min, max, top_down);
+ ret = kexec_purgatory_setup_kbuf(pi, kbuf);
if (ret)
return ret;
+ ret = kexec_purgatory_setup_sechdrs(pi, kbuf);
+ if (ret)
+ goto out_free_kbuf;
+
ret = kexec_apply_relocations(image);
if (ret)
goto out;
- *load_addr = pi->purgatory_load_addr;
return 0;
out:
vfree(pi->sechdrs);
pi->sechdrs = NULL;
-
+out_free_kbuf:
vfree(pi->purgatory_buf);
pi->purgatory_buf = NULL;
return ret;
}
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
+/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
const char *strtab;
+ int i, k;
- if (!pi->sechdrs || !pi->ehdr)
+ if (!pi->ehdr)
return NULL;
- sechdrs = pi->sechdrs;
ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
for (i = 0; i < ehdr->e_shnum; i++) {
if (sechdrs[i].sh_type != SHT_SYMTAB)
@@ -937,8 +972,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
if (sechdrs[i].sh_link >= ehdr->e_shnum)
/* Invalid strtab section number */
continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
/* Go through symbols for a match */
for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
@@ -966,7 +1001,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
{
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
+ const Elf_Sym *sym;
Elf_Shdr *sechdr;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -989,9 +1024,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
void *buf, unsigned int size, bool get_value)
{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Sym *sym;
+ Elf_Shdr *sec;
char *sym_buf;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -1004,16 +1039,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return -EINVAL;
}
- sechdrs = pi->sechdrs;
+ sec = pi->sechdrs + sym->st_shndx;
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sec->sh_type == SHT_NOBITS) {
pr_err("symbol %s is in a bss section. Cannot %s\n", name,
get_value ? "get" : "set");
return -EINVAL;
}
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
+ sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;
if (get_value)
memcpy((void *)buf, sym_buf, size);
@@ -1022,3 +1056,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
+#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happened, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == mem->max_nr_ranges - 1)
+ return -ENOMEM;
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ phdr++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index 9d833d913c84..42e487488554 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,7 +34,8 @@
#define PANIC_BLINK_SPD 18
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
-static unsigned long tainted_mask;
+static unsigned long tainted_mask =
+ IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic);
* is being removed anyway.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */
- { 'F', ' ', true }, /* TAINT_FORCED_MODULE */
- { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */
- { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */
- { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */
- { 'B', ' ', false }, /* TAINT_BAD_PAGE */
- { 'U', ' ', false }, /* TAINT_USER */
- { 'D', ' ', false }, /* TAINT_DIE */
- { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */
- { 'W', ' ', false }, /* TAINT_WARN */
- { 'C', ' ', true }, /* TAINT_CRAP */
- { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */
- { 'O', ' ', true }, /* TAINT_OOT_MODULE */
- { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
- { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
- { 'K', ' ', true }, /* TAINT_LIVEPATCH */
- { 'X', ' ', true }, /* TAINT_AUX */
+ [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true },
+ [ TAINT_FORCED_MODULE ] = { 'F', ' ', true },
+ [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false },
+ [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false },
+ [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false },
+ [ TAINT_BAD_PAGE ] = { 'B', ' ', false },
+ [ TAINT_USER ] = { 'U', ' ', false },
+ [ TAINT_DIE ] = { 'D', ' ', false },
+ [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false },
+ [ TAINT_WARN ] = { 'W', ' ', false },
+ [ TAINT_CRAP ] = { 'C', ' ', true },
+ [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false },
+ [ TAINT_OOT_MODULE ] = { 'O', ' ', true },
+ [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true },
+ [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false },
+ [ TAINT_LIVEPATCH ] = { 'K', ' ', true },
+ [ TAINT_AUX ] = { 'X', ' ', true },
+ [ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
};
/**
- * print_tainted - return a string to represent the kernel taint state.
+ * print_tainted - return a string to represent the kernel taint state.
*
- * 'P' - Proprietary module has been loaded.
- * 'F' - Module has been forcibly loaded.
- * 'S' - SMP with CPUs not designed for SMP.
- * 'R' - User forced a module unload.
- * 'M' - System experienced a machine check exception.
- * 'B' - System has hit bad_page.
- * 'U' - Userspace-defined naughtiness.
- * 'D' - Kernel has oopsed before
- * 'A' - ACPI table overridden.
- * 'W' - Taint on warning.
- * 'C' - modules from drivers/staging are loaded.
- * 'I' - Working around severe firmware bug.
- * 'O' - Out-of-tree module has been loaded.
- * 'E' - Unsigned module has been loaded.
- * 'L' - A soft lockup has previously occurred.
- * 'K' - Kernel has been live patched.
- * 'X' - Auxiliary taint, for distros' use.
+ * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
*
- * The string is overwritten by the next call to print_tainted().
+ * The string is overwritten by the next call to print_tainted(),
+ * but is always NULL terminated.
*/
const char *print_tainted(void)
{
static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
+ BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
if (tainted_mask) {
char *s;
int i;
@@ -554,6 +543,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
else
dump_stack();
+ print_irqtrace_events(current);
+
print_oops_end_marker();
/* Just a warning, don't kill lockdep. */
diff --git a/kernel/params.c b/kernel/params.c
index cc9108c2a1fd..ce89f757e6da 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b)
static void param_check_unsafe(const struct kernel_param *kp)
{
if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
- pr_warn("Setting dangerous option %s - tainting kernel\n",
- kp->name);
+ pr_notice("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
}
diff --git a/kernel/pid.c b/kernel/pid.c
index ed6c343fe50d..157fe4b19971 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT;
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .idr = IDR_INIT,
+ .idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 93b57f026688..2a2ac53d8b8b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,55 +23,39 @@
#include <linux/sched/signal.h>
#include <linux/idr.h>
-struct pid_cache {
- int nr_ids;
- char name[16];
- struct kmem_cache *cachep;
- struct list_head list;
-};
-
-static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+/* Write once array, filled from the beginning. */
+static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
/*
* creates the kmem cache to allocate pids from.
- * @nr_ids: the number of numerical ids this pid will have to carry
+ * @level: pid namespace level
*/
-static struct kmem_cache *create_pid_cachep(int nr_ids)
+static struct kmem_cache *create_pid_cachep(unsigned int level)
{
- struct pid_cache *pcache;
- struct kmem_cache *cachep;
-
+ /* Level 0 is init_pid_ns.pid_cachep */
+ struct kmem_cache **pkc = &pid_cache[level - 1];
+ struct kmem_cache *kc;
+ char name[4 + 10 + 1];
+ unsigned int len;
+
+ kc = READ_ONCE(*pkc);
+ if (kc)
+ return kc;
+
+ snprintf(name, sizeof(name), "pid_%u", level + 1);
+ len = sizeof(struct pid) + level * sizeof(struct upid);
mutex_lock(&pid_caches_mutex);
- list_for_each_entry(pcache, &pid_caches_lh, list)
- if (pcache->nr_ids == nr_ids)
- goto out;
-
- pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
- if (pcache == NULL)
- goto err_alloc;
-
- snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
- cachep = kmem_cache_create(pcache->name,
- sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (cachep == NULL)
- goto err_cachep;
-
- pcache->nr_ids = nr_ids;
- pcache->cachep = cachep;
- list_add(&pcache->list, &pid_caches_lh);
-out:
+ /* Name collision forces to do allocation under mutex. */
+ if (!*pkc)
+ *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
mutex_unlock(&pid_caches_mutex);
- return pcache->cachep;
-
-err_cachep:
- kfree(pcache);
-err_alloc:
- mutex_unlock(&pid_caches_mutex);
- return NULL;
+ /* current can fail, but someone else can succeed. */
+ return READ_ONCE(*pkc);
}
static void proc_cleanup_work(struct work_struct *work)
@@ -80,9 +64,6 @@ static void proc_cleanup_work(struct work_struct *work)
pid_ns_release_proc(ns);
}
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
-#define MAX_PID_NS_LEVEL 32
-
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -119,7 +100,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
idr_init(&ns->idr);
- ns->pid_cachep = create_pid_cachep(level + 1);
+ ns->pid_cachep = create_pid_cachep(level);
if (ns->pid_cachep == NULL)
goto out_free_idr;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 4710f1b142fc..5454cc639a8d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1053,7 +1053,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- pr_info("Starting manual resume from disk\n");
+ pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device);
noresume = 0;
software_resume();
return n;
@@ -1061,6 +1061,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(resume);
+static ssize_t resume_offset_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
+}
+
+static ssize_t resume_offset_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t n)
+{
+ unsigned long long offset;
+ int rc;
+
+ rc = kstrtoull(buf, 0, &offset);
+ if (rc)
+ return rc;
+ swsusp_resume_block = offset;
+
+ return n;
+}
+
+power_attr(resume_offset);
+
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -1106,6 +1129,7 @@ power_attr(reserved_size);
static struct attribute * g[] = {
&disk_attr.attr,
+ &resume_offset_attr.attr,
&resume_attr.attr,
&image_size_attr.attr,
&reserved_size_attr.attr,
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..fa39092b7aea 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
* changed
*/
plist_del(node, &c->list);
+ /* fall through */
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
+ /* fall through */
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f274fbef821d..2f4af216bd6e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -42,7 +42,6 @@
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
-#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
@@ -52,6 +51,7 @@
#include <linux/uaccess.h>
#include <asm/sections.h>
+#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -2162,7 +2162,7 @@ void suspend_console(void)
{
if (!console_suspend_enabled)
return;
- printk("Suspending console(s) (use no_console_suspend to debug)\n");
+ pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
console_lock();
console_suspended = 1;
up_console_sem();
@@ -2781,6 +2781,7 @@ EXPORT_SYMBOL(unregister_console);
*/
void __init console_init(void)
{
+ int ret;
initcall_t *call;
/* Setup the default TTY line discipline. */
@@ -2791,8 +2792,11 @@ void __init console_init(void)
* inform about problems etc..
*/
call = __con_initcall_start;
+ trace_initcall_level("console");
while (call < __con_initcall_end) {
- (*call)();
+ trace_initcall_start((*call));
+ ret = (*call)();
+ trace_initcall_finish((*call), ret);
call++;
}
}
@@ -3257,60 +3261,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
-static char dump_stack_arch_desc_str[128];
-
-/**
- * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
- * @fmt: printf-style format string
- * @...: arguments for the format string
- *
- * The configured string will be printed right after utsname during task
- * dumps. Usually used to add arch-specific system identifiers. If an
- * arch wants to make use of such an ID string, it should initialize this
- * as soon as possible during boot.
- */
-void __init dump_stack_set_arch_desc(const char *fmt, ...)
-{
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
- fmt, args);
- va_end(args);
-}
-
-/**
- * dump_stack_print_info - print generic debug info for dump_stack()
- * @log_lvl: log level
- *
- * Arch-specific dump_stack() implementations can use this function to
- * print out the same debug information as the generic dump_stack().
- */
-void dump_stack_print_info(const char *log_lvl)
-{
- printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
- log_lvl, raw_smp_processor_id(), current->pid, current->comm,
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
-
- if (dump_stack_arch_desc_str[0] != '\0')
- printk("%sHardware name: %s\n",
- log_lvl, dump_stack_arch_desc_str);
-
- print_worker_info(log_lvl, current);
-}
-
-/**
- * show_regs_print_info - print generic debug info for show_regs()
- * @log_lvl: log level
- *
- * show_regs() implementations can use this function to print out generic
- * debug information.
- */
-void show_regs_print_info(const char *log_lvl)
-{
- dump_stack_print_info(log_lvl);
-}
-
#endif
diff --git a/kernel/resource.c b/kernel/resource.c
index e270b5048988..2af6c03858b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old,
alloc.start = constraint->alignf(constraint->alignf_data, &avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
+ if (alloc.start <= alloc.end &&
+ resource_contains(&avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28b68995a417..5e10aaeebfcc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -874,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
#ifdef CONFIG_SMP
@@ -5560,6 +5560,7 @@ void idle_task_exit(void)
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
mmdrop(mm);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2b124811947d..d2c6083304b4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -631,10 +631,9 @@ fail:
stop_kthread:
sugov_kthread_stop(sg_policy);
-
-free_sg_policy:
mutex_unlock(&global_tunables_lock);
+free_sg_policy:
sugov_policy_free(sg_policy);
disable_fast_switch:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d1c7bf7c7e5b..e7b3008b85bb 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0951d1c58d2f..54dc31e7ab9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7089,7 +7089,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
set_skip_buddy(se);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2975f195e1c4..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -141,13 +141,15 @@ static void cpuidle_idle_call(void)
}
/*
- * Tell the RCU framework we are entering an idle section,
- * so no more rcu read side critical sections and one more
+ * The RCU framework needs to be told that we are entering an idle
+ * section, so no more rcu read side critical sections and one more
* step to the grace period
*/
- rcu_idle_enter();
if (cpuidle_not_available(drv, dev)) {
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
default_idle_call();
goto exit_idle;
}
@@ -164,20 +166,37 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
+ rcu_idle_enter();
+
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
+
+ rcu_idle_exit();
}
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
+ bool stop_tick = true;
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &stop_tick);
+
+ if (stop_tick)
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+
+ rcu_idle_enter();
+
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -222,6 +241,7 @@ static void do_idle(void)
rmb();
if (cpu_is_offline(cpu)) {
+ tick_nohz_idle_stop_tick_protected();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -235,10 +255,12 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ tick_nohz_idle_restart_tick();
cpu_idle_poll();
- else
+ } else {
cpuidle_idle_call();
+ }
arch_cpu_idle_exit();
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 86b77987435e..7aef6b4e885a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
continue;
raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+
if (rt_rq->rt_time) {
u64 runtime;
@@ -859,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
* 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq_clock_skip_update(rq, false);
+ rq_clock_cancel_skipupdate(rq);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c3deaee7a7a2..15750c222ca2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
- if (skip)
- rq->clock_update_flags |= RQCF_REQ_SKIP;
- else
- rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
+}
+
+/*
+ * See rt task throttoling, which is the only time a skip
+ * request is cancelled.
+ */
+static inline void rq_clock_cancel_skipupdate(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
struct rq_flags {
diff --git a/kernel/signal.c b/kernel/signal.c
index f04466655238..d4ccea599692 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -770,7 +770,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
}
}
- return security_task_kill(t, info, sig, 0);
+ return security_task_kill(t, info, sig, NULL);
}
/**
@@ -1361,7 +1361,7 @@ static int kill_as_cred_perm(const struct cred *cred,
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
- const struct cred *cred, u32 secid)
+ const struct cred *cred)
{
int ret = -EINVAL;
struct task_struct *p;
@@ -1380,7 +1380,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
ret = -EPERM;
goto out_unlock;
}
- ret = security_task_kill(p, info, sig, secid);
+ ret = security_task_kill(p, info, sig, cred);
if (ret)
goto out_unlock;
@@ -2844,10 +2844,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
if ((sig == SIGFPE) && (si_code == FPE_FIXME))
layout = SIL_FAULT;
#endif
-#ifdef BUS_FIXME
- if ((sig == SIGBUS) && (si_code == BUS_FIXME))
- layout = SIL_FAULT;
-#endif
}
return layout;
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 24d243ef8e71..177de3640c78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -460,40 +460,46 @@ struct tasklet_head {
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
-void __tasklet_schedule(struct tasklet_struct *t)
+static void __tasklet_schedule_common(struct tasklet_struct *t,
+ struct tasklet_head __percpu *headp,
+ unsigned int softirq_nr)
{
+ struct tasklet_head *head;
unsigned long flags;
local_irq_save(flags);
+ head = this_cpu_ptr(headp);
t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ *head->tail = t;
+ head->tail = &(t->next);
+ raise_softirq_irqoff(softirq_nr);
local_irq_restore(flags);
}
+
+void __tasklet_schedule(struct tasklet_struct *t)
+{
+ __tasklet_schedule_common(t, &tasklet_vec,
+ TASKLET_SOFTIRQ);
+}
EXPORT_SYMBOL(__tasklet_schedule);
void __tasklet_hi_schedule(struct tasklet_struct *t)
{
- unsigned long flags;
-
- local_irq_save(flags);
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_restore(flags);
+ __tasklet_schedule_common(t, &tasklet_hi_vec,
+ HI_SOFTIRQ);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
-static __latent_entropy void tasklet_action(struct softirq_action *a)
+static void tasklet_action_common(struct softirq_action *a,
+ struct tasklet_head *tl_head,
+ unsigned int softirq_nr)
{
struct tasklet_struct *list;
local_irq_disable();
- list = __this_cpu_read(tasklet_vec.head);
- __this_cpu_write(tasklet_vec.head, NULL);
- __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
+ list = tl_head->head;
+ tl_head->head = NULL;
+ tl_head->tail = &tl_head->head;
local_irq_enable();
while (list) {
@@ -515,47 +521,21 @@ static __latent_entropy void tasklet_action(struct softirq_action *a)
local_irq_disable();
t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ *tl_head->tail = t;
+ tl_head->tail = &t->next;
+ __raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
}
-static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(struct softirq_action *a)
{
- struct tasklet_struct *list;
-
- local_irq_disable();
- list = __this_cpu_read(tasklet_hi_vec.head);
- __this_cpu_write(tasklet_hi_vec.head, NULL);
- __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
- local_irq_enable();
-
- while (list) {
- struct tasklet_struct *t = list;
-
- list = list->next;
-
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
- }
+ tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
+}
- local_irq_disable();
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- __raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_enable();
- }
+static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
+{
+ tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
void tasklet_init(struct tasklet_struct *t,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f98f28c12020..6a78cf70761d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -253,6 +253,10 @@ extern struct ctl_table random_table[];
extern struct ctl_table epoll_table[];
#endif
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+extern struct ctl_table firmware_config_table[];
+#endif
+
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
#endif
@@ -748,6 +752,13 @@ static struct ctl_table kern_table[] = {
.mode = 0555,
.child = usermodehelper_table,
},
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+ {
+ .procname = "firmware_config",
+ .mode = 0555,
+ .child = firmware_config_table,
+ },
+#endif
{
.procname = "overflowuid",
.data = &overflowuid,
@@ -1329,7 +1340,7 @@ static struct ctl_table vm_table[] = {
{
.procname = "dirtytime_expire_seconds",
.data = &dirtytime_expire_interval,
- .maxlen = sizeof(dirty_expire_interval),
+ .maxlen = sizeof(dirtytime_expire_interval),
.mode = 0644,
.proc_handler = dirtytime_interval_handler,
.extra1 = &zero,
@@ -2500,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
}
#endif
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
@@ -2543,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
- * Returns 0 on success.
+ * Returns 0 on success or -EINVAL on write when the range check fails.
*/
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2556,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
struct do_proc_douintvec_minmax_conv_param {
unsigned int *min;
unsigned int *max;
@@ -2603,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
* check for UINT_MAX to avoid having to support wrap around uses from
* userspace.
*
- * Returns 0 on success.
+ * Returns 0 on success or -ERANGE on write when the range check fails.
*/
int proc_douintvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ec09ce9a6012..639321bf2e39 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -326,6 +326,17 @@ static int alarmtimer_resume(struct device *dev)
}
#endif
+static void
+__alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+ enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+ timerqueue_init(&alarm->node);
+ alarm->timer.function = alarmtimer_fired;
+ alarm->function = function;
+ alarm->type = type;
+ alarm->state = ALARMTIMER_STATE_INACTIVE;
+}
+
/**
* alarm_init - Initialize an alarm structure
* @alarm: ptr to alarm to be initialized
@@ -335,13 +346,9 @@ static int alarmtimer_resume(struct device *dev)
void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
- timerqueue_init(&alarm->node);
hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
- HRTIMER_MODE_ABS);
- alarm->timer.function = alarmtimer_fired;
- alarm->function = function;
- alarm->type = type;
- alarm->state = ALARMTIMER_STATE_INACTIVE;
+ HRTIMER_MODE_ABS);
+ __alarm_init(alarm, type, function);
}
EXPORT_SYMBOL_GPL(alarm_init);
@@ -719,6 +726,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
__set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&alarm->timer);
+
if (!alarm->data)
return 0;
@@ -740,6 +749,15 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
return -ERESTART_RESTARTBLOCK;
}
+static void
+alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type,
+ enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+ hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ __alarm_init(alarm, type, function);
+}
+
/**
* alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
* @restart: ptr to restart block
@@ -752,7 +770,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
ktime_t exp = restart->nanosleep.expires;
struct alarm alarm;
- alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+ alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
return alarmtimer_do_nsleep(&alarm, exp, type);
}
@@ -784,7 +802,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
- alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+ alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 65f9e3f24dde..0e974cface0b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -594,6 +594,9 @@ static void __clocksource_select(bool skipcur)
if (!best)
return;
+ if (!strlen(override_name))
+ goto found;
+
/* Check for the override clocksource. */
list_for_each_entry(cs, &clocksource_list, list) {
if (skipcur && cs == curr_clocksource)
@@ -625,6 +628,7 @@ static void __clocksource_select(bool skipcur)
break;
}
+found:
if (curr_clocksource != best && !timekeeping_notify(best)) {
pr_info("Switched to clocksource %s\n", best->name);
curr_clocksource = best;
@@ -853,16 +857,16 @@ EXPORT_SYMBOL(clocksource_unregister);
#ifdef CONFIG_SYSFS
/**
- * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * current_clocksource_show - sysfs interface for current clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing current clocksource.
*/
-static ssize_t
-sysfs_show_current_clocksources(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t current_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
ssize_t count = 0;
@@ -891,7 +895,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
}
/**
- * sysfs_override_clocksource - interface for manually overriding clocksource
+ * current_clocksource_store - interface for manually overriding clocksource
* @dev: unused
* @attr: unused
* @buf: name of override clocksource
@@ -900,9 +904,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
* Takes input from sysfs interface for manually overriding the default
* clocksource selection.
*/
-static ssize_t sysfs_override_clocksource(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t current_clocksource_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
@@ -916,9 +920,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
return ret;
}
+static DEVICE_ATTR_RW(current_clocksource);
/**
- * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
+ * unbind_clocksource_store - interface for manually unbinding clocksource
* @dev: unused
* @attr: unused
* @buf: unused
@@ -926,7 +931,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
*
* Takes input from sysfs interface for manually unbinding a clocksource.
*/
-static ssize_t sysfs_unbind_clocksource(struct device *dev,
+static ssize_t unbind_clocksource_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -950,19 +955,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
return ret ? ret : count;
}
+static DEVICE_ATTR_WO(unbind_clocksource);
/**
- * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * available_clocksource_show - sysfs interface for listing clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing registered clocksources
*/
-static ssize_t
-sysfs_show_available_clocksources(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t available_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct clocksource *src;
ssize_t count = 0;
@@ -986,17 +991,15 @@ sysfs_show_available_clocksources(struct device *dev,
return count;
}
+static DEVICE_ATTR_RO(available_clocksource);
-/*
- * Sysfs setup bits:
- */
-static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
- sysfs_override_clocksource);
-
-static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
-
-static DEVICE_ATTR(available_clocksource, 0444,
- sysfs_show_available_clocksources, NULL);
+static struct attribute *clocksource_attrs[] = {
+ &dev_attr_current_clocksource.attr,
+ &dev_attr_unbind_clocksource.attr,
+ &dev_attr_available_clocksource.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(clocksource);
static struct bus_type clocksource_subsys = {
.name = "clocksource",
@@ -1006,6 +1009,7 @@ static struct bus_type clocksource_subsys = {
static struct device device_clocksource = {
.id = 0,
.bus = &clocksource_subsys,
+ .groups = clocksource_groups,
};
static int __init init_clocksource_sysfs(void)
@@ -1014,17 +1018,7 @@ static int __init init_clocksource_sysfs(void)
if (!error)
error = device_register(&device_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_current_clocksource);
- if (!error)
- error = device_create_file(&device_clocksource,
- &dev_attr_unbind_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_available_clocksource);
+
return error;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 23788100e214..eda1210ce50f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -91,11 +91,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get_real,
},
{
- .index = HRTIMER_BASE_BOOTTIME,
- .clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
- },
- {
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
@@ -111,11 +106,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get_real,
},
{
- .index = HRTIMER_BASE_BOOTTIME_SOFT,
- .clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
- },
- {
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
@@ -129,7 +119,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+ [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
@@ -490,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
while ((base = __next_base((cpu_base), &(active))))
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
unsigned int active,
ktime_t expires_next)
{
@@ -502,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
+ if (timer == exclude) {
+ /* Get to the next timer in the queue. */
+ next = timerqueue_iterate_next(next);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ }
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
+
+ /* Skip cpu_base update if a timer is being excluded. */
+ if (exclude)
+ continue;
+
if (timer->is_soft)
cpu_base->softirq_next_timer = timer;
else
@@ -548,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+ active, KTIME_MAX);
next_timer = cpu_base->softirq_next_timer;
}
@@ -556,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+ expires_next);
}
return expires_next;
@@ -565,14 +571,12 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
- ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
- offs_real, offs_boot, offs_tai);
+ offs_real, offs_tai);
base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
- base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
return now;
@@ -1202,6 +1206,39 @@ u64 hrtimer_get_next_event(void)
return expires;
}
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude: timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (__hrtimer_hres_active(cpu_base)) {
+ unsigned int active;
+
+ if (!cpu_base->softirq_activated) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ expires = __hrtimer_next_event_base(cpu_base, exclude,
+ active, KTIME_MAX);
+ }
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ expires = __hrtimer_next_event_base(cpu_base, exclude, active,
+ expires);
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+ return expires;
+}
#endif
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
/* USER_HZ period (usecs): */
-unsigned long tick_usec = TICK_USEC;
+unsigned long tick_usec = USER_TICK_USEC;
/* SHIFTED_HZ period (nsecs): */
unsigned long tick_nsec;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 69a937c3cd81..e0dbae98db9d 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -83,6 +83,8 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
case CLOCK_BOOTTIME:
get_monotonic_boottime64(tp);
break;
+ case CLOCK_MONOTONIC_ACTIVE:
+ ktime_get_active_ts64(tp);
default:
return -EINVAL;
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 10b7186d0638..b6899b5060bd 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -252,15 +252,16 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
{
- get_monotonic_boottime64(tp);
+ timekeeping_clocktai64(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_monotonic_active(clockid_t which_clock,
+ struct timespec64 *tp)
{
- timekeeping_clocktai64(tp);
+ ktime_get_active_ts64(tp);
return 0;
}
@@ -1316,19 +1317,9 @@ static const struct k_clock clock_tai = {
.timer_arm = common_hrtimer_arm,
};
-static const struct k_clock clock_boottime = {
+static const struct k_clock clock_monotonic_active = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
- .nsleep = common_nsleep,
- .timer_create = common_timer_create,
- .timer_set = common_timer_set,
- .timer_get = common_timer_get,
- .timer_del = common_timer_del,
- .timer_rearm = common_hrtimer_rearm,
- .timer_forward = common_hrtimer_forward,
- .timer_remaining = common_hrtimer_remaining,
- .timer_try_to_cancel = common_hrtimer_try_to_cancel,
- .timer_arm = common_hrtimer_arm,
+ .clock_get = posix_get_monotonic_active,
};
static const struct k_clock * const posix_clocks[] = {
@@ -1339,10 +1330,11 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw,
[CLOCK_REALTIME_COARSE] = &clock_realtime_coarse,
[CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse,
- [CLOCK_BOOTTIME] = &clock_boottime,
+ [CLOCK_BOOTTIME] = &clock_monotonic,
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+ [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active,
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 49edc1c4f3e6..099572ca4a8f 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -419,6 +419,19 @@ void tick_suspend_local(void)
clockevents_shutdown(td->evtdev);
}
+static void tick_forward_next_period(void)
+{
+ ktime_t delta, now = ktime_get();
+ u64 n;
+
+ delta = ktime_sub(now, tick_next_period);
+ n = ktime_divns(delta, tick_period);
+ tick_next_period += n * tick_period;
+ if (tick_next_period < now)
+ tick_next_period += tick_period;
+ tick_sched_forward_next_period();
+}
+
/**
* tick_resume_local - Resume the local tick device
*
@@ -431,6 +444,8 @@ void tick_resume_local(void)
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
bool broadcast = tick_resume_check_broadcast();
+ tick_forward_next_period();
+
clockevents_tick_resume(td->evtdev);
if (!broadcast) {
if (td->mode == TICKDEV_MODE_PERIODIC)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..21efab7485ca 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,6 +141,12 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
#endif /* !(BROADCAST && ONESHOT) */
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+extern void tick_sched_forward_next_period(void);
+#else
+static inline void tick_sched_forward_next_period(void) { }
+#endif
+
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5d4a0342f934..646645e981f9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -52,6 +52,15 @@ struct tick_sched *tick_get_tick_sched(int cpu)
static ktime_t last_jiffies_update;
/*
+ * Called after resume. Make sure that jiffies are not fast forwarded due to
+ * clock monotonic being forwarded by the suspended time.
+ */
+void tick_sched_forward_next_period(void)
+{
+ last_jiffies_update = tick_next_period;
+}
+
+/*
* Must be called with interrupts disabled !
*/
static void tick_do_update_jiffies64(ktime_t now)
@@ -113,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void)
return period;
}
-
-static void tick_sched_do_timer(ktime_t now)
+static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -134,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now)
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+
+ if (ts->inidle)
+ ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -465,7 +476,9 @@ __setup("nohz=", setup_tick_nohz);
bool tick_nohz_tick_stopped(void)
{
- return __this_cpu_read(tick_cpu_sched.tick_stopped);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->tick_stopped;
}
bool tick_nohz_tick_stopped_cpu(int cpu)
@@ -528,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
sched_clock_idle_wakeup_event();
}
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now = ktime_get();
-
- ts->idle_entrytime = now;
+ ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
- return now;
}
/**
@@ -644,13 +654,10 @@ static inline bool local_timer_softirq_pending(void)
return local_softirq_pending() & TIMER_SOFTIRQ;
}
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
- ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long seq, basejiff;
- ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -659,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
+ ts->timer_expires_base = basemono;
/*
* Keep the periodic tick, when RCU, architecture or irq_work
@@ -703,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* next period, so no point in stopping it either, bail.
*/
if (!ts->tick_stopped) {
- tick = 0;
+ ts->timer_expires = 0;
goto out;
}
}
/*
+ * If this CPU is the one which had the do_timer() duty last, we limit
+ * the sleep time to the timekeeping max_deferment value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu != tick_do_timer_cpu &&
+ (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+ delta = KTIME_MAX;
+
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
+
+ ts->timer_expires = min_t(u64, expires, next_tick);
+
+out:
+ return ts->timer_expires;
+}
+
+static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ u64 basemono = ts->timer_expires_base;
+ u64 expires = ts->timer_expires;
+ ktime_t tick = expires;
+
+ /* Make sure we won't be trying to stop it twice in a row. */
+ ts->timer_expires_base = 0;
+
+ /*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
- * was the one which had the do_timer() duty last. If this CPU
- * is the one which had the do_timer() duty last, we limit the
- * sleep time to the timekeeping max_deferment value.
- * Otherwise we can sleep as long as we want.
+ * was the one which had the do_timer() duty last.
*/
- delta = timekeeping_max_deferment();
if (cpu == tick_do_timer_cpu) {
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->do_timer_last = 1;
} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- delta = KTIME_MAX;
ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- delta = KTIME_MAX;
}
- /* Calculate the next expiry time */
- if (delta < (KTIME_MAX - basemono))
- expires = basemono + delta;
- else
- expires = KTIME_MAX;
-
- expires = min_t(u64, expires, next_tick);
- tick = expires;
-
/* Skip reprogram of event if its not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
- goto out;
+ return;
WARN_ON_ONCE(1);
printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -777,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
- goto out;
+ return;
}
hrtimer_set_expires(&ts->sched_timer, tick);
@@ -786,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(tick, 1);
-out:
- /*
- * Update the estimated sleep length until the next timer
- * (not only the tick).
- */
- ts->sleep_length = ktime_sub(dev->next_event, now);
- return tick;
}
+static void tick_nohz_retain_tick(struct tick_sched *ts)
+{
+ ts->timer_expires_base = 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+{
+ if (tick_nohz_next_event(ts, cpu))
+ tick_nohz_stop_tick(ts, cpu);
+ else
+ tick_nohz_retain_tick(ts);
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
@@ -830,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
return;
if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ tick_nohz_stop_sched_tick(ts, cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
@@ -856,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = NSEC_PER_SEC / HZ;
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
- }
if (need_resched())
return false;
@@ -894,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
- ktime_t now, expires;
+ ktime_t expires;
int cpu = smp_processor_id();
- now = tick_nohz_start_idle(ts);
+ /*
+ * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+ * tick timer expiration time is known already.
+ */
+ if (ts->timer_expires_base)
+ expires = ts->timer_expires;
+ else if (can_stop_idle_tick(cpu, ts))
+ expires = tick_nohz_next_event(ts, cpu);
+ else
+ return;
+
+ ts->idle_calls++;
- if (can_stop_idle_tick(cpu, ts)) {
+ if (expires > 0LL) {
int was_stopped = ts->tick_stopped;
- ts->idle_calls++;
+ tick_nohz_stop_tick(ts, cpu);
- expires = tick_nohz_stop_sched_tick(ts, now, cpu);
- if (expires > 0LL) {
- ts->idle_sleeps++;
- ts->idle_expires = expires;
- }
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
+ } else {
+ tick_nohz_retain_tick(ts);
}
}
/**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
*
* When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+ __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+void tick_nohz_idle_retain_tick(void)
+{
+ tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
+ /*
+ * Undo the effect of get_next_timer_interrupt() called from
+ * tick_nohz_next_event().
+ */
+ timer_clear_idle();
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
- * - rcu_idle_enter() after its last use of RCU before the CPU is put
- * to sleep.
- * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ * Called when we start the idle loop.
*/
void tick_nohz_idle_enter(void)
{
@@ -940,8 +993,11 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
+
+ WARN_ON_ONCE(ts->timer_expires_base);
+
ts->inidle = 1;
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
local_irq_enable();
}
@@ -959,21 +1015,62 @@ void tick_nohz_irq_exit(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
else
tick_nohz_full_update_tick(ts);
}
/**
- * tick_nohz_get_sleep_length - return the length of the current sleep
+ * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ */
+bool tick_nohz_idle_got_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->got_idle_tick) {
+ ts->got_idle_tick = 0;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
*
* Called from power state control code with interrupts disabled
*/
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ int cpu = smp_processor_id();
+ /*
+ * The idle entry time is expected to be a sufficient approximation of
+ * the current time at this point.
+ */
+ ktime_t now = ts->idle_entrytime;
+ ktime_t next_event;
+
+ WARN_ON_ONCE(!ts->inidle);
+
+ *delta_next = ktime_sub(dev->next_event, now);
- return ts->sleep_length;
+ if (!can_stop_idle_tick(cpu, ts))
+ return *delta_next;
+
+ next_event = tick_nohz_next_event(ts, cpu);
+ if (!next_event)
+ return *delta_next;
+
+ /*
+ * If the next highres timer to expire is earlier than next_event, the
+ * idle governor needs to know that.
+ */
+ next_event = min_t(u64, next_event,
+ hrtimer_next_event_without(&ts->sched_timer));
+
+ return ktime_sub(next_event, now);
}
/**
@@ -1022,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#endif
}
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
/**
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
@@ -1032,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
WARN_ON_ONCE(!ts->inidle);
+ WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 0;
+ idle_active = ts->idle_active;
+ tick_stopped = ts->tick_stopped;
- if (ts->idle_active || ts->tick_stopped)
+ if (idle_active || tick_stopped)
now = ktime_get();
- if (ts->idle_active)
+ if (idle_active)
tick_nohz_stop_idle(ts, now);
- if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
- }
+ if (tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, now);
local_irq_enable();
}
@@ -1065,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
dev->next_event = KTIME_MAX;
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
/* No need to reprogram if we are running tickless */
@@ -1160,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,31 +38,37 @@ enum tick_nohz_mode {
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length: Duration of the current idle sleep
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
* @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @got_idle_tick: Tick timer function has run with @inidle set
*/
struct tick_sched {
struct hrtimer sched_timer;
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
+
+ unsigned int inidle : 1;
+ unsigned int tick_stopped : 1;
+ unsigned int idle_active : 1;
+ unsigned int do_timer_last : 1;
+ unsigned int got_idle_tick : 1;
+
ktime_t last_tick;
ktime_t next_tick;
- int inidle;
- int tick_stopped;
unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
- int idle_active;
ktime_t idle_entrytime;
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
- ktime_t sleep_length;
unsigned long last_jiffies;
+ u64 timer_expires;
+ u64 timer_expires_base;
u64 next_timer;
ktime_t idle_expires;
- int do_timer_last;
atomic_t tick_dep_mask;
};
diff --git a/kernel/time/time.c b/kernel/time/time.c
index bd4e6c7dd689..3044d48ebe56 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -488,6 +488,18 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
+struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
+{
+ struct timespec64 ts = ns_to_timespec64(nsec);
+ struct __kernel_old_timeval tv;
+
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000;
+
+ return tv;
+}
+EXPORT_SYMBOL(ns_to_kernel_old_timeval);
+
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd03317e7b57..ca90219a1e73 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -138,7 +138,12 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
- tk->offs_boot = ktime_add(tk->offs_boot, delta);
+ /* Update both bases so mono and raw stay coupled. */
+ tk->tkr_mono.base += delta;
+ tk->tkr_raw.base += delta;
+
+ /* Accumulate time spent in suspend */
+ tk->time_suspended += delta;
}
/*
@@ -332,6 +337,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_mono.mult = clock->mult;
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
+ tk->skip_second_overflow = 0;
}
/* Timekeeper helper functions. */
@@ -467,36 +473,6 @@ u64 ktime_get_raw_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
-/**
- * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
- *
- * To keep it NMI safe since we're accessing from tracing, we're not using a
- * separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
- *
- * (1) Its possible that a timestamp be taken after the boot offset is updated
- * but before the timekeeper is updated. If this happens, the new boot offset
- * is added to the old timekeeping making the clock appear to update slightly
- * earlier:
- * CPU 0 CPU 1
- * timekeeping_inject_sleeptime64()
- * __timekeeping_inject_sleeptime(tk, delta);
- * timestamp();
- * timekeeping_update(tk, TK_CLEAR_NTP...);
- *
- * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
- * partially updated. Since the tk->offs_boot update is a rare event, this
- * should be a rare occurrence which postprocessing should be able to handle.
- */
-u64 notrace ktime_get_boot_fast_ns(void)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
-
- return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
-}
-EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-
-
/*
* See comment for __ktime_get_fast_ns() vs. timestamp ordering
*/
@@ -788,7 +764,6 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
static ktime_t *offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
- [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
[TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
};
@@ -886,6 +861,39 @@ void ktime_get_ts64(struct timespec64 *ts)
EXPORT_SYMBOL_GPL(ktime_get_ts64);
/**
+ * ktime_get_active_ts64 - Get the active non-suspended monotonic clock
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime clock and
+ * the wall_to_monotonic offset, subtracts the accumulated suspend time and
+ * stores the result in normalized timespec64 format in the variable
+ * pointed to by @ts.
+ */
+void ktime_get_active_ts64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 tomono, tsusp;
+ u64 nsec, nssusp;
+ unsigned int seq;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
+ tomono = tk->wall_to_monotonic;
+ nssusp = tk->time_suspended;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsec + tomono.tv_nsec);
+ tsusp = ns_to_timespec64(nssusp);
+ *ts = timespec64_sub(*ts, tsusp);
+}
+
+/**
* ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
*
* Returns the seconds portion of CLOCK_MONOTONIC with a single non
@@ -1585,7 +1593,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
return;
}
tk_xtime_add(tk, delta);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
tk_debug_account_sleep_time(delta);
}
@@ -1799,20 +1806,19 @@ device_initcall(timekeeping_init_ops);
*/
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
s64 offset,
- bool negative,
- int adj_scale)
+ s32 mult_adj)
{
s64 interval = tk->cycle_interval;
- s32 mult_adj = 1;
- if (negative) {
- mult_adj = -mult_adj;
+ if (mult_adj == 0) {
+ return;
+ } else if (mult_adj == -1) {
interval = -interval;
- offset = -offset;
+ offset = -offset;
+ } else if (mult_adj != 1) {
+ interval *= mult_adj;
+ offset *= mult_adj;
}
- mult_adj <<= adj_scale;
- interval <<= adj_scale;
- offset <<= adj_scale;
/*
* So the following can be confusing.
@@ -1860,8 +1866,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
* xtime_nsec_2 = xtime_nsec_1 - offset
* Which simplfies to:
* xtime_nsec -= offset
- *
- * XXX - TODO: Doc ntp_error calculation.
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
@@ -1872,89 +1876,38 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr_mono.xtime_nsec -= offset;
- tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
/*
- * Calculate the multiplier adjustment needed to match the frequency
- * specified by NTP
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
*/
-static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
- s64 offset)
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- s64 interval = tk->cycle_interval;
- s64 xinterval = tk->xtime_interval;
- u32 base = tk->tkr_mono.clock->mult;
- u32 max = tk->tkr_mono.clock->maxadj;
- u32 cur_adj = tk->tkr_mono.mult;
- s64 tick_error;
- bool negative;
- u32 adj_scale;
-
- /* Remove any current error adj from freq calculation */
- if (tk->ntp_err_mult)
- xinterval -= tk->cycle_interval;
-
- tk->ntp_tick = ntp_tick_length();
-
- /* Calculate current error per tick */
- tick_error = ntp_tick_length() >> tk->ntp_error_shift;
- tick_error -= (xinterval + tk->xtime_remainder);
-
- /* Don't worry about correcting it if its small */
- if (likely((tick_error >= 0) && (tick_error <= interval)))
- return;
-
- /* preserve the direction of correction */
- negative = (tick_error < 0);
+ u32 mult;
- /* If any adjustment would pass the max, just return */
- if (negative && (cur_adj - 1) <= (base - max))
- return;
- if (!negative && (cur_adj + 1) >= (base + max))
- return;
/*
- * Sort out the magnitude of the correction, but
- * avoid making so large a correction that we go
- * over the max adjustment.
+ * Determine the multiplier from the current NTP tick length.
+ * Avoid expensive division when the tick length doesn't change.
*/
- adj_scale = 0;
- tick_error = abs(tick_error);
- while (tick_error > interval) {
- u32 adj = 1 << (adj_scale + 1);
-
- /* Check if adjustment gets us within 1 unit from the max */
- if (negative && (cur_adj - adj) <= (base - max))
- break;
- if (!negative && (cur_adj + adj) >= (base + max))
- break;
-
- adj_scale++;
- tick_error >>= 1;
+ if (likely(tk->ntp_tick == ntp_tick_length())) {
+ mult = tk->tkr_mono.mult - tk->ntp_err_mult;
+ } else {
+ tk->ntp_tick = ntp_tick_length();
+ mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
+ tk->xtime_remainder, tk->cycle_interval);
}
- /* scale the corrections */
- timekeeping_apply_adjustment(tk, offset, negative, adj_scale);
-}
+ /*
+ * If the clock is behind the NTP time, increase the multiplier by 1
+ * to catch up with it. If it's ahead and there was a remainder in the
+ * tick division, the clock will slow down. Otherwise it will stay
+ * ahead until the tick length changes to a non-divisible value.
+ */
+ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
+ mult += tk->ntp_err_mult;
-/*
- * Adjust the timekeeper's multiplier to the correct frequency
- * and also to reduce the accumulated error value.
- */
-static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
-{
- /* Correct for the current frequency error */
- timekeeping_freqadjust(tk, offset);
-
- /* Next make a small adjustment to fix any cumulative error */
- if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
- tk->ntp_err_mult = 1;
- timekeeping_apply_adjustment(tk, offset, 0, 0);
- } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
- /* Undo any existing error adjustment */
- timekeeping_apply_adjustment(tk, offset, 1, 0);
- tk->ntp_err_mult = 0;
- }
+ timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);
if (unlikely(tk->tkr_mono.clock->maxadj &&
(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
@@ -1971,18 +1924,15 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* in the code above, its possible the required corrective factor to
* xtime_nsec could cause it to underflow.
*
- * Now, since we already accumulated the second, cannot simply roll
- * the accumulated second back, since the NTP subsystem has been
- * notified via second_overflow. So instead we push xtime_nsec forward
- * by the amount we underflowed, and add that amount into the error.
- *
- * We'll correct this error next time through this function, when
- * xtime_nsec is not as small.
+ * Now, since we have already accumulated the second and the NTP
+ * subsystem has been notified via second_overflow(), we need to skip
+ * the next update.
*/
if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
- s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
- tk->tkr_mono.xtime_nsec = 0;
- tk->ntp_error += neg << tk->ntp_error_shift;
+ tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
+ tk->tkr_mono.shift;
+ tk->xtime_sec--;
+ tk->skip_second_overflow = 1;
}
}
@@ -2005,6 +1955,15 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
+ /*
+ * Skip NTP update if this second was accumulated before,
+ * i.e. xtime_nsec underflowed in timekeeping_adjust()
+ */
+ if (unlikely(tk->skip_second_overflow)) {
+ tk->skip_second_overflow = 0;
+ continue;
+ }
+
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
if (unlikely(leap)) {
@@ -2121,7 +2080,7 @@ void update_wall_time(void)
shift--;
}
- /* correct the clock when NTP error is too big */
+ /* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
/*
@@ -2166,7 +2125,7 @@ out:
void getboottime64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
+ ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended);
*ts = ktime_to_timespec64(t);
}
@@ -2236,7 +2195,6 @@ void do_timer(unsigned long ticks)
* ktime_get_update_offsets_now - hrtimer helper
* @cwsseq: pointer to check and store the clock was set sequence number
* @offs_real: pointer to storage for monotonic -> realtime offset
- * @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
*
* Returns current monotonic time and updates the offsets if the
@@ -2246,7 +2204,7 @@ void do_timer(unsigned long ticks)
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
- ktime_t *offs_boot, ktime_t *offs_tai)
+ ktime_t *offs_tai)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
@@ -2263,7 +2221,6 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
if (*cwsseq != tk->clock_was_set_seq) {
*cwsseq = tk->clock_was_set_seq;
*offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
*offs_tai = tk->offs_tai;
}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..79b67f5e0343 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -6,7 +6,6 @@
*/
extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
ktime_t *offs_real,
- ktime_t *offs_boot,
ktime_t *offs_tai);
extern int timekeeping_valid_for_hres(void);
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index fdbeeb02dde9..cf5c0828ee31 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
-extern time64_t __ktime_get_real_seconds(void);
-
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0b249e2f0c3c..c4f0f2e4126e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -606,7 +606,10 @@ config HIST_TRIGGERS
event activity as an initial guide for further investigation
using more advanced tools.
- See Documentation/trace/events.txt.
+ Inter-event tracing of quantities such as latencies is also
+ supported using hist triggers under this option.
+
+ See Documentation/trace/histogram.txt.
If in doubt, say N.
config MMIOTRACE_TEST
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 01e6b3a38871..d88e96d4e12c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.arg3_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
}
}
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
@@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_override_return_proto;
#endif
default:
- return tracing_func_proto(func_id);
+ return tracing_func_proto(func_id, prog);
}
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
@@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
@@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
default:
- return tracing_func_proto(func_id);
+ return tracing_func_proto(func_id, prog);
}
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
@@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
@@ -731,15 +737,97 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
default:
- return tracing_func_proto(func_id);
+ return tracing_func_proto(func_id, prog);
}
}
+/*
+ * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
+ * to avoid potential recursive reuse issue when/if tracepoints are added
+ * inside bpf_*_event_output and/or bpf_get_stack_id
+ */
+static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ struct bpf_map *, map, u64, flags, void *, data, u64, size)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ return ____bpf_perf_event_output(regs, map, flags, data, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+ .func = bpf_perf_event_output_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ struct bpf_map *, map, u64, flags)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
+ return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
+ .func = bpf_get_stackid_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto_raw_tp;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto_raw_tp;
+ default:
+ return tracing_func_proto(func_id, prog);
+ }
+}
+
+static bool raw_tp_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ /* largest tracepoint in the kernel has 12 args */
+ if (off < 0 || off >= sizeof(__u64) * 12)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
+ .get_func_proto = raw_tp_prog_func_proto,
+ .is_valid_access = raw_tp_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops raw_tracepoint_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
- sample_period);
+ const int size_u64 = sizeof(u64);
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
@@ -750,8 +838,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
switch (off) {
case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
- bpf_ctx_record_field_size(info, size_sp);
- if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
+ bpf_ctx_record_field_size(info, size_u64);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
+ return false;
+ break;
+ case bpf_ctx_range(struct bpf_perf_event_data, addr):
+ bpf_ctx_record_field_size(info, size_u64);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
return false;
break;
default:
@@ -778,6 +871,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct perf_sample_data, period, 8,
target_size));
break;
+ case offsetof(struct bpf_perf_event_data, addr):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ data), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_perf_event_data_kern, data));
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct perf_sample_data, addr, 8,
+ target_size));
+ break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
regs), si->dst_reg, si->src_reg,
@@ -896,3 +997,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
return ret;
}
+
+extern struct bpf_raw_event_map __start__bpf_raw_tp[];
+extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
+
+struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+{
+ struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
+
+ for (; btp < __stop__bpf_raw_tp; btp++) {
+ if (!strcmp(btp->tp->name, name))
+ return btp;
+ }
+ return NULL;
+}
+
+static __always_inline
+void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+{
+ rcu_read_lock();
+ preempt_disable();
+ (void) BPF_PROG_RUN(prog, args);
+ preempt_enable();
+ rcu_read_unlock();
+}
+
+#define UNPACK(...) __VA_ARGS__
+#define REPEAT_1(FN, DL, X, ...) FN(X)
+#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
+#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
+#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
+#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
+#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
+#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
+#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
+#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
+#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
+#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
+#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
+#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__)
+
+#define SARG(X) u64 arg##X
+#define COPY(X) args[X] = arg##X
+
+#define __DL_COM (,)
+#define __DL_SEM (;)
+
+#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+
+#define BPF_TRACE_DEFN_x(x) \
+ void bpf_trace_run##x(struct bpf_prog *prog, \
+ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
+ { \
+ u64 args[x]; \
+ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
+ __bpf_trace_run(prog, args); \
+ } \
+ EXPORT_SYMBOL_GPL(bpf_trace_run##x)
+BPF_TRACE_DEFN_x(1);
+BPF_TRACE_DEFN_x(2);
+BPF_TRACE_DEFN_x(3);
+BPF_TRACE_DEFN_x(4);
+BPF_TRACE_DEFN_x(5);
+BPF_TRACE_DEFN_x(6);
+BPF_TRACE_DEFN_x(7);
+BPF_TRACE_DEFN_x(8);
+BPF_TRACE_DEFN_x(9);
+BPF_TRACE_DEFN_x(10);
+BPF_TRACE_DEFN_x(11);
+BPF_TRACE_DEFN_x(12);
+
+static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ struct tracepoint *tp = btp->tp;
+
+ /*
+ * check that program doesn't access arguments beyond what's
+ * available in this tracepoint
+ */
+ if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
+ return -EINVAL;
+
+ return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+}
+
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ int err;
+
+ mutex_lock(&bpf_event_mutex);
+ err = __bpf_probe_register(btp, prog);
+ mutex_unlock(&bpf_event_mutex);
+ return err;
+}
+
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ int err;
+
+ mutex_lock(&bpf_event_mutex);
+ err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ mutex_unlock(&bpf_event_mutex);
+ return err;
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eac9ce2c57a2..16bbf062018f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3902,14 +3902,13 @@ static bool module_exists(const char *module)
{
/* All modules have the symbol __this_module */
const char this_mod[] = "__this_module";
- const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
- char modname[modname_size + 1];
+ char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
unsigned long val;
int n;
- n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
- if (n > modname_size)
+ if (n > sizeof(modname) - 1)
return false;
val = module_kallsyms_lookup_name(modname);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dcf1c4dd3efe..c9cb9767d49b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,7 @@
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include <asm/local.h>
@@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
RINGBUF_TYPE_PADDING);
trace_seq_printf(s, "\ttime_extend : type == %d\n",
RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\ttime_stamp : type == %d\n",
+ RINGBUF_TYPE_TIME_STAMP);
trace_seq_printf(s, "\tdata max type_len == %d\n",
RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
@@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
enum {
RB_LEN_TIME_EXTEND = 8,
- RB_LEN_TIME_STAMP = 16,
+ RB_LEN_TIME_STAMP = 8,
};
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+#define extended_time(event) \
+ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
+
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
{
unsigned len = 0;
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ if (extended_time(event)) {
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
@@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length;
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
length = rb_event_length(event);
@@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
@@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
+/**
+ * ring_buffer_event_time_stamp - return the event's extended timestamp
+ * @event: the event to get the timestamp of
+ *
+ * Returns the extended timestamp associated with a data event.
+ * An extended time_stamp is a 64-bit timestamp represented
+ * internally in a special way that makes the best use of space
+ * contained within a ring buffer event. This function decodes
+ * it and maps it to a straight u64 value.
+ */
+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+{
+ u64 ts;
+
+ ts = event->array[0];
+ ts <<= TS_SHIFT;
+ ts += event->time_delta;
+
+ return ts;
+}
+
/* Flag when events were overwritten */
#define RB_MISSED_EVENTS (1 << 31)
/* Missed count stored at end */
@@ -451,6 +478,7 @@ struct ring_buffer_per_cpu {
struct buffer_page *reader_page;
unsigned long lost_events;
unsigned long last_overrun;
+ unsigned long nest;
local_t entries_bytes;
local_t entries;
local_t overrun;
@@ -488,6 +516,7 @@ struct ring_buffer {
u64 (*clock)(void);
struct rb_irq_work irq_work;
+ bool time_stamp_abs;
};
struct ring_buffer_iter {
@@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
{
struct buffer_page *bpage, *tmp;
+ bool user_thread = current->mm != NULL;
+ gfp_t mflags;
long i;
+ /*
+ * Check if the available memory is there first.
+ * Note, si_mem_available() only gives us a rough estimate of available
+ * memory. It may not be accurate. But we don't care, we just want
+ * to prevent doing any allocation when it is obvious that it is
+ * not going to succeed.
+ */
+ i = si_mem_available();
+ if (i < nr_pages)
+ return -ENOMEM;
+
+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
+
+ /*
+ * If a user thread allocates too much, and si_mem_available()
+ * reports there's enough memory, even though there is not.
+ * Make sure the OOM killer kills this thread. This can happen
+ * even with RETRY_MAYFAIL because another task may be doing
+ * an allocation after this task has taken all memory.
+ * This is the task the OOM killer needs to take out during this
+ * loop, even if it was triggered by an allocation somewhere else.
+ */
+ if (user_thread)
+ set_current_oom_origin();
for (i = 0; i < nr_pages; i++) {
struct page *page;
- /*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
+
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL | __GFP_RETRY_MAYFAIL,
- cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu));
if (!bpage)
goto free_pages;
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
rb_init_page(bpage->page);
+
+ if (user_thread && fatal_signal_pending(current))
+ goto free_pages;
}
+ if (user_thread)
+ clear_current_oom_origin();
return 0;
@@ -1166,6 +1225,8 @@ free_pages:
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
+ if (user_thread)
+ clear_current_oom_origin();
return -ENOMEM;
}
@@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
buffer->clock = clock;
}
+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+{
+ buffer->time_stamp_abs = abs;
+}
+
+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+{
+ return buffer->time_stamp_abs;
+}
+
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path, do not inline */
static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+ if (abs)
+ event->type_len = RINGBUF_TYPE_TIME_STAMP;
+ else
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
- /* Not the first event on the page? */
- if (rb_event_index(event)) {
+ /* Not the first event on the page, or not delta? */
+ if (abs || rb_event_index(event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2254,7 +2328,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
* add it to the start of the resevered space.
*/
if (unlikely(info->add_timestamp)) {
- event = rb_add_time_stamp(event, delta);
+ bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+
+ event = rb_add_time_stamp(event, info->delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
@@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
static inline void rb_event_discard(struct ring_buffer_event *event)
{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
@@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
cpu_buffer->write_stamp += delta;
+ } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ delta = ring_buffer_event_time_stamp(event);
+ cpu_buffer->write_stamp = delta;
} else
cpu_buffer->write_stamp += event->time_delta;
}
@@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
bit = pc & NMI_MASK ? RB_CTX_NMI :
pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
- if (unlikely(val & (1 << bit)))
+ if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
return 1;
- val |= (1 << bit);
+ val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
return 0;
@@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+ cpu_buffer->current_context &=
+ cpu_buffer->current_context - (1 << cpu_buffer->nest);
+}
+
+/* The recursive locking above uses 4 bits */
+#define NESTED_BITS 4
+
+/**
+ * ring_buffer_nest_start - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * The ring buffer has a safty mechanism to prevent recursion.
+ * But there may be a case where a trace needs to be done while
+ * tracing something else. In this case, calling this function
+ * will allow this function to nest within a currently active
+ * ring_buffer_lock_reserve().
+ *
+ * Call this function before calling another ring_buffer_lock_reserve() and
+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_start(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* Enabled by ring_buffer_nest_end() */
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ /* This is the shift value for the above recusive locking */
+ cpu_buffer->nest += NESTED_BITS;
+}
+
+/**
+ * ring_buffer_nest_end - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * Must be called after ring_buffer_nest_start() and after the
+ * ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_end(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* disabled by ring_buffer_nest_start() */
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ /* This is the shift value for the above recusive locking */
+ cpu_buffer->nest -= NESTED_BITS;
+ preempt_enable_notrace();
}
/**
@@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
info->add_timestamp = 1;
}
@@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail)
+ if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
info->delta = 0;
/* See if we shot pass the end of this buffer page */
@@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
/* make sure this diff is calculated here */
barrier();
- /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
+ if (ring_buffer_time_stamp_abs(buffer)) {
+ info.delta = info.ts;
+ rb_handle_timestamp(cpu_buffer, &info);
+ } else /* Did the write stamp get updated already? */
+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
info.delta = diff;
if (unlikely(test_time_stamp(info.delta)))
rb_handle_timestamp(cpu_buffer, &info);
@@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ delta = ring_buffer_event_time_stamp(event);
+ cpu_buffer->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ delta = ring_buffer_event_time_stamp(event);
+ iter->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
struct buffer_page *reader;
int nr_loops = 0;
+ if (ts)
+ *ts = 0;
again:
/*
* We repeat when a time extend is encountered.
@@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
goto again;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ if (ts) {
+ *ts = ring_buffer_event_time_stamp(event);
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+ cpu_buffer->cpu, ts);
+ }
+ /* Internal data, OK to advance */
rb_advance_reader(cpu_buffer);
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
+ if (ts && !(*ts)) {
*ts = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
@@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
int nr_loops = 0;
+ if (ts)
+ *ts = 0;
+
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
@@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
goto again;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ if (ts) {
+ *ts = ring_buffer_event_time_stamp(event);
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+ cpu_buffer->cpu, ts);
+ }
+ /* Internal data, OK to advance */
rb_advance_iter(iter);
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
+ if (ts && !(*ts)) {
*ts = iter->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
cpu_buffer->cpu, ts);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 20a2300ae4e8..dfbcf9ee1447 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/trace.h>
+#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include "trace.h"
@@ -1164,10 +1165,18 @@ static struct {
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
- { ktime_get_boot_fast_ns, "boot", 1 },
+ { ktime_get_mono_fast_ns, "boot", 1 },
ARCH_TRACE_CLOCKS
};
+bool trace_clock_in_ns(struct trace_array *tr)
+{
+ if (trace_clocks[tr->clock_id].in_ns)
+ return true;
+
+ return false;
+}
+
/*
* trace_parser_get_init - gets the buffer for trace parser
*/
@@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
*current_rb = trace_file->tr->trace_buffer.buffer;
- if ((trace_file->flags &
+ if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
(EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
/* Try to use the per cpu buffer first */
@@ -2380,7 +2389,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
* trace_buffer_unlock_commit_regs()
* trace_event_buffer_commit()
* trace_event_raw_event_xxx()
-*/
+ */
# define STACK_SKIP 3
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
@@ -4515,6 +4524,9 @@ static const char readme_msg[] =
#ifdef CONFIG_X86_64
" x86-tsc: TSC cycle counter\n"
#endif
+ "\n timestamp_mode\t-view the mode used to timestamp events\n"
+ " delta: Delta difference against a buffer-wide timestamp\n"
+ " absolute: Absolute (standalone) timestamp\n"
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
"\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
" tracing_cpumask\t- Limit which CPUs to trace\n"
@@ -4691,8 +4703,9 @@ static const char readme_msg[] =
"\t .sym display an address as a symbol\n"
"\t .sym-offset display an address as a symbol and offset\n"
"\t .execname display a common_pid as a program name\n"
- "\t .syscall display a syscall id as a syscall name\n\n"
- "\t .log2 display log2 value rather than raw number\n\n"
+ "\t .syscall display a syscall id as a syscall name\n"
+ "\t .log2 display log2 value rather than raw number\n"
+ "\t .usecs display a common_timestamp in microseconds\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
"\t until told to do so. 'continue' can be used to start or\n"
@@ -6202,7 +6215,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
return 0;
}
-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
{
int i;
@@ -6282,6 +6295,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
return ret;
}
+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+
+ mutex_lock(&trace_types_lock);
+
+ if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+ seq_puts(m, "delta [absolute]\n");
+ else
+ seq_puts(m, "[delta] absolute\n");
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr))
+ return -ENODEV;
+
+ ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+{
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+
+ if (abs && tr->time_stamp_abs_ref++)
+ goto out;
+
+ if (!abs) {
+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (--tr->time_stamp_abs_ref)
+ goto out;
+ }
+
+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->max_buffer.buffer)
+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
+#endif
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
@@ -6529,6 +6607,13 @@ static const struct file_operations trace_clock_fops = {
.write = tracing_clock_write,
};
+static const struct file_operations trace_time_stamp_mode_fops = {
+ .open = tracing_time_stamp_mode_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_single_release_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -7699,6 +7784,7 @@ static int instance_mkdir(const char *name)
INIT_LIST_HEAD(&tr->systems);
INIT_LIST_HEAD(&tr->events);
+ INIT_LIST_HEAD(&tr->hist_vars);
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -7851,6 +7937,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
+ trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
create_trace_options_dir(tr);
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -8446,6 +8535,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.systems);
INIT_LIST_HEAD(&global_trace.events);
+ INIT_LIST_HEAD(&global_trace.hist_vars);
list_add(&global_trace.list, &ftrace_trace_arrays);
apply_trace_boot_options();
@@ -8507,3 +8597,21 @@ __init static int clear_boot_tracer(void)
fs_initcall(tracer_init_tracefs);
late_initcall_sync(clear_boot_tracer);
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__init static int tracing_set_default_clock(void)
+{
+ /* sched_clock_stable() is determined in late_initcall */
+ if (!trace_boot_clock && !sched_clock_stable()) {
+ printk(KERN_WARNING
+ "Unstable clock detected, switching default tracing clock to \"global\"\n"
+ "If you want to keep using the local clock, then add:\n"
+ " \"trace_clock=local\"\n"
+ "on the kernel command line\n");
+ tracing_set_clock(&global_trace, "global");
+ }
+
+ return 0;
+}
+late_initcall_sync(tracing_set_default_clock);
+#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..6fb46a06c9dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -273,6 +273,8 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
+ int time_stamp_abs_ref;
+ struct list_head hist_vars;
};
enum {
@@ -286,6 +288,11 @@ extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
extern void trace_array_put(struct trace_array *tr);
+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
+
+extern bool trace_clock_in_ns(struct trace_array *tr);
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -1209,12 +1216,11 @@ struct ftrace_event_field {
int is_signed;
};
+struct prog_entry;
+
struct event_filter {
- int n_preds; /* Number assigned */
- int a_preds; /* allocated */
- struct filter_pred __rcu *preds;
- struct filter_pred __rcu *root;
- char *filter_string;
+ struct prog_entry __rcu *prog;
+ char *filter_string;
};
struct event_subsystem {
@@ -1291,7 +1297,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
- *tt = event_triggers_call(file, entry);
+ *tt = event_triggers_call(file, entry, event);
if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
(unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1328,7 +1334,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
if (tt)
- event_triggers_post_call(file, tt, entry);
+ event_triggers_post_call(file, tt, entry, event);
}
/**
@@ -1361,7 +1367,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
irq_flags, pc, regs);
if (tt)
- event_triggers_post_call(file, tt, entry);
+ event_triggers_post_call(file, tt, entry, event);
}
#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1406,12 +1412,8 @@ struct filter_pred {
unsigned short *ops;
struct ftrace_event_field *field;
int offset;
- int not;
+ int not;
int op;
- unsigned short index;
- unsigned short parent;
- unsigned short left;
- unsigned short right;
};
static inline bool is_string_field(struct ftrace_event_field *field)
@@ -1543,6 +1545,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
extern void unpause_named_trigger(struct event_trigger_data *data);
extern void set_named_trigger_data(struct event_trigger_data *data,
struct event_trigger_data *named_data);
+extern struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data);
extern int register_event_command(struct event_command *cmd);
extern int unregister_event_command(struct event_command *cmd);
extern int register_trigger_hist_enable_disable_cmds(void);
@@ -1586,7 +1590,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
*/
struct event_trigger_ops {
void (*func)(struct event_trigger_data *data,
- void *rec);
+ void *rec,
+ struct ring_buffer_event *rbe);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
void (*free)(struct event_trigger_ops *ops,
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 5fdc779f411d..d8a188e0418a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void)
int this_cpu;
u64 now;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
now = sched_clock_cpu(this_cpu);
@@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void)
arch_spin_unlock(&trace_clock_struct.lock);
out:
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
return now;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 2c416509b834..c79193e598f5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
ret = strncpy_from_user(
func, u64_to_user_ptr(p_event->attr.kprobe_func),
KSYM_NAME_LEN);
+ if (ret == KSYM_NAME_LEN)
+ ret = -E2BIG;
if (ret < 0)
goto out;
@@ -300,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
return -ENOMEM;
ret = strncpy_from_user(
path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
+ if (ret == PATH_MAX)
+ return -E2BIG;
if (ret < 0)
goto out;
if (path[0] == '\0') {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a764aec3c9a1..9b4716bb8bb0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -33,163 +33,595 @@
"# Only events with the given fields will be affected.\n" \
"# If no events are modified, an error message will be displayed here"
-enum filter_op_ids
-{
- OP_OR,
- OP_AND,
- OP_GLOB,
- OP_NE,
- OP_EQ,
- OP_LT,
- OP_LE,
- OP_GT,
- OP_GE,
- OP_BAND,
- OP_NOT,
- OP_NONE,
- OP_OPEN_PAREN,
-};
+/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */
+#define OPS \
+ C( OP_GLOB, "~" ), \
+ C( OP_NE, "!=" ), \
+ C( OP_EQ, "==" ), \
+ C( OP_LE, "<=" ), \
+ C( OP_LT, "<" ), \
+ C( OP_GE, ">=" ), \
+ C( OP_GT, ">" ), \
+ C( OP_BAND, "&" ), \
+ C( OP_MAX, NULL )
-struct filter_op {
- int id;
- char *string;
- int precedence;
-};
+#undef C
+#define C(a, b) a
-/* Order must be the same as enum filter_op_ids above */
-static struct filter_op filter_ops[] = {
- { OP_OR, "||", 1 },
- { OP_AND, "&&", 2 },
- { OP_GLOB, "~", 4 },
- { OP_NE, "!=", 4 },
- { OP_EQ, "==", 4 },
- { OP_LT, "<", 5 },
- { OP_LE, "<=", 5 },
- { OP_GT, ">", 5 },
- { OP_GE, ">=", 5 },
- { OP_BAND, "&", 6 },
- { OP_NOT, "!", 6 },
- { OP_NONE, "OP_NONE", 0 },
- { OP_OPEN_PAREN, "(", 0 },
-};
+enum filter_op_ids { OPS };
-enum {
- FILT_ERR_NONE,
- FILT_ERR_INVALID_OP,
- FILT_ERR_UNBALANCED_PAREN,
- FILT_ERR_TOO_MANY_OPERANDS,
- FILT_ERR_OPERAND_TOO_LONG,
- FILT_ERR_FIELD_NOT_FOUND,
- FILT_ERR_ILLEGAL_FIELD_OP,
- FILT_ERR_ILLEGAL_INTVAL,
- FILT_ERR_BAD_SUBSYS_FILTER,
- FILT_ERR_TOO_MANY_PREDS,
- FILT_ERR_MISSING_FIELD,
- FILT_ERR_INVALID_FILTER,
- FILT_ERR_IP_FIELD_ONLY,
- FILT_ERR_ILLEGAL_NOT_OP,
-};
+#undef C
+#define C(a, b) b
-static char *err_text[] = {
- "No error",
- "Invalid operator",
- "Unbalanced parens",
- "Too many operands",
- "Operand too long",
- "Field not found",
- "Illegal operation for field type",
- "Illegal integer value",
- "Couldn't find or set field in one of a subsystem's events",
- "Too many terms in predicate expression",
- "Missing field name and/or value",
- "Meaningless filter expression",
- "Only 'ip' field is supported for function trace",
- "Illegal use of '!'",
-};
+static const char * ops[] = { OPS };
-struct opstack_op {
- enum filter_op_ids op;
- struct list_head list;
-};
+/*
+ * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
+ * pred_funcs_##type below must match the order of them above.
+ */
+#define PRED_FUNC_START OP_LE
+#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START)
+
+#define ERRORS \
+ C(NONE, "No error"), \
+ C(INVALID_OP, "Invalid operator"), \
+ C(TOO_MANY_OPEN, "Too many '('"), \
+ C(TOO_MANY_CLOSE, "Too few '('"), \
+ C(MISSING_QUOTE, "Missing matching quote"), \
+ C(OPERAND_TOO_LONG, "Operand too long"), \
+ C(EXPECT_STRING, "Expecting string field"), \
+ C(EXPECT_DIGIT, "Expecting numeric field"), \
+ C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \
+ C(FIELD_NOT_FOUND, "Field not found"), \
+ C(ILLEGAL_INTVAL, "Illegal integer value"), \
+ C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
+ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
+ C(INVALID_FILTER, "Meaningless filter expression"), \
+ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
+ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"),
+
+#undef C
+#define C(a, b) FILT_ERR_##a
+
+enum { ERRORS };
+
+#undef C
+#define C(a, b) b
+
+static char *err_text[] = { ERRORS };
+
+/* Called after a '!' character but "!=" and "!~" are not "not"s */
+static bool is_not(const char *str)
+{
+ switch (str[1]) {
+ case '=':
+ case '~':
+ return false;
+ }
+ return true;
+}
-struct postfix_elt {
- enum filter_op_ids op;
- char *operand;
- struct list_head list;
+/**
+ * prog_entry - a singe entry in the filter program
+ * @target: Index to jump to on a branch (actually one minus the index)
+ * @when_to_branch: The value of the result of the predicate to do a branch
+ * @pred: The predicate to execute.
+ */
+struct prog_entry {
+ int target;
+ int when_to_branch;
+ struct filter_pred *pred;
};
-struct filter_parse_state {
- struct filter_op *ops;
- struct list_head opstack;
- struct list_head postfix;
+/**
+ * update_preds- assign a program entry a label target
+ * @prog: The program array
+ * @N: The index of the current entry in @prog
+ * @when_to_branch: What to assign a program entry for its branch condition
+ *
+ * The program entry at @N has a target that points to the index of a program
+ * entry that can have its target and when_to_branch fields updated.
+ * Update the current program entry denoted by index @N target field to be
+ * that of the updated entry. This will denote the entry to update if
+ * we are processing an "||" after an "&&"
+ */
+static void update_preds(struct prog_entry *prog, int N, int invert)
+{
+ int t, s;
+
+ t = prog[N].target;
+ s = prog[t].target;
+ prog[t].when_to_branch = invert;
+ prog[t].target = N;
+ prog[N].target = s;
+}
+
+struct filter_parse_error {
int lasterr;
int lasterr_pos;
-
- struct {
- char *string;
- unsigned int cnt;
- unsigned int tail;
- } infix;
-
- struct {
- char string[MAX_FILTER_STR_VAL];
- int pos;
- unsigned int tail;
- } operand;
};
-struct pred_stack {
- struct filter_pred **preds;
- int index;
+static void parse_error(struct filter_parse_error *pe, int err, int pos)
+{
+ pe->lasterr = err;
+ pe->lasterr_pos = pos;
+}
+
+typedef int (*parse_pred_fn)(const char *str, void *data, int pos,
+ struct filter_parse_error *pe,
+ struct filter_pred **pred);
+
+enum {
+ INVERT = 1,
+ PROCESS_AND = 2,
+ PROCESS_OR = 4,
};
-/* If not of not match is equal to not of not, then it is a match */
+/*
+ * Without going into a formal proof, this explains the method that is used in
+ * parsing the logical expressions.
+ *
+ * For example, if we have: "a && !(!b || (c && g)) || d || e && !f"
+ * The first pass will convert it into the following program:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * To do this, we use a data structure to represent each of the above
+ * predicate and conditions that has:
+ *
+ * predicate, when_to_branch, invert, target
+ *
+ * The "predicate" will hold the function to determine the result "r".
+ * The "when_to_branch" denotes what "r" should be if a branch is to be taken
+ * "&&" would contain "!r" or (0) and "||" would contain "r" or (1).
+ * The "invert" holds whether the value should be reversed before testing.
+ * The "target" contains the label "l#" to jump to.
+ *
+ * A stack is created to hold values when parentheses are used.
+ *
+ * To simplify the logic, the labels will start at 0 and not 1.
+ *
+ * The possible invert values are 1 and 0. The number of "!"s that are in scope
+ * before the predicate determines the invert value, if the number is odd then
+ * the invert value is 1 and 0 otherwise. This means the invert value only
+ * needs to be toggled when a new "!" is introduced compared to what is stored
+ * on the stack, where parentheses were used.
+ *
+ * The top of the stack and "invert" are initialized to zero.
+ *
+ * ** FIRST PASS **
+ *
+ * #1 A loop through all the tokens is done:
+ *
+ * #2 If the token is an "(", the stack is push, and the current stack value
+ * gets the current invert value, and the loop continues to the next token.
+ * The top of the stack saves the "invert" value to keep track of what
+ * the current inversion is. As "!(a && !b || c)" would require all
+ * predicates being affected separately by the "!" before the parentheses.
+ * And that would end up being equivalent to "(!a || b) && !c"
+ *
+ * #3 If the token is an "!", the current "invert" value gets inverted, and
+ * the loop continues. Note, if the next token is a predicate, then
+ * this "invert" value is only valid for the current program entry,
+ * and does not affect other predicates later on.
+ *
+ * The only other acceptable token is the predicate string.
+ *
+ * #4 A new entry into the program is added saving: the predicate and the
+ * current value of "invert". The target is currently assigned to the
+ * previous program index (this will not be its final value).
+ *
+ * #5 We now enter another loop and look at the next token. The only valid
+ * tokens are ")", "&&", "||" or end of the input string "\0".
+ *
+ * #6 The invert variable is reset to the current value saved on the top of
+ * the stack.
+ *
+ * #7 The top of the stack holds not only the current invert value, but also
+ * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher
+ * precedence than "||". That is "a && b || c && d" is equivalent to
+ * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs
+ * to be processed. This is the case if an "&&" was the last token. If it was
+ * then we call update_preds(). This takes the program, the current index in
+ * the program, and the current value of "invert". More will be described
+ * below about this function.
+ *
+ * #8 If the next token is "&&" then we set a flag in the top of the stack
+ * that denotes that "&&" needs to be processed, break out of this loop
+ * and continue with the outer loop.
+ *
+ * #9 Otherwise, if a "||" needs to be processed then update_preds() is called.
+ * This is called with the program, the current index in the program, but
+ * this time with an inverted value of "invert" (that is !invert). This is
+ * because the value taken will become the "when_to_branch" value of the
+ * program.
+ * Note, this is called when the next token is not an "&&". As stated before,
+ * "&&" takes higher precedence, and "||" should not be processed yet if the
+ * next logical operation is "&&".
+ *
+ * #10 If the next token is "||" then we set a flag in the top of the stack
+ * that denotes that "||" needs to be processed, break out of this loop
+ * and continue with the outer loop.
+ *
+ * #11 If this is the end of the input string "\0" then we break out of both
+ * loops.
+ *
+ * #12 Otherwise, the next token is ")", where we pop the stack and continue
+ * this inner loop.
+ *
+ * Now to discuss the update_pred() function, as that is key to the setting up
+ * of the program. Remember the "target" of the program is initialized to the
+ * previous index and not the "l" label. The target holds the index into the
+ * program that gets affected by the operand. Thus if we have something like
+ * "a || b && c", when we process "a" the target will be "-1" (undefined).
+ * When we process "b", its target is "0", which is the index of "a", as that's
+ * the predicate that is affected by "||". But because the next token after "b"
+ * is "&&" we don't call update_preds(). Instead continue to "c". As the
+ * next token after "c" is not "&&" but the end of input, we first process the
+ * "&&" by calling update_preds() for the "&&" then we process the "||" by
+ * callin updates_preds() with the values for processing "||".
+ *
+ * What does that mean? What update_preds() does is to first save the "target"
+ * of the program entry indexed by the current program entry's "target"
+ * (remember the "target" is initialized to previous program entry), and then
+ * sets that "target" to the current index which represents the label "l#".
+ * That entry's "when_to_branch" is set to the value passed in (the "invert"
+ * or "!invert"). Then it sets the current program entry's target to the saved
+ * "target" value (the old value of the program that had its "target" updated
+ * to the label).
+ *
+ * Looking back at "a || b && c", we have the following steps:
+ * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target
+ * "||" - flag that we need to process "||"; continue outer loop
+ * "b" - prog[1] = { "b", X, 0 }
+ * "&&" - flag that we need to process "&&"; continue outer loop
+ * (Notice we did not process "||")
+ * "c" - prog[2] = { "c", X, 1 }
+ * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&"
+ * t = prog[2].target; // t = 1
+ * s = prog[t].target; // s = 0
+ * prog[t].target = 2; // Set target to "l2"
+ * prog[t].when_to_branch = 0;
+ * prog[2].target = s;
+ * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||"
+ * t = prog[2].target; // t = 0
+ * s = prog[t].target; // s = -1
+ * prog[t].target = 2; // Set target to "l2"
+ * prog[t].when_to_branch = 1;
+ * prog[2].target = s;
+ *
+ * #13 Which brings us to the final step of the first pass, which is to set
+ * the last program entry's when_to_branch and target, which will be
+ * when_to_branch = 0; target = N; ( the label after the program entry after
+ * the last program entry processed above).
+ *
+ * If we denote "TRUE" to be the entry after the last program entry processed,
+ * and "FALSE" the program entry after that, we are now done with the first
+ * pass.
+ *
+ * Making the above "a || b && c" have a progam of:
+ * prog[0] = { "a", 1, 2 }
+ * prog[1] = { "b", 0, 2 }
+ * prog[2] = { "c", 0, 3 }
+ *
+ * Which translates into:
+ * n0: r = a; l0: if (r) goto l2;
+ * n1: r = b; l1: if (!r) goto l2;
+ * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;"
+ * T: return TRUE; l3:
+ * F: return FALSE
+ *
+ * Although, after the first pass, the program is correct, it is
+ * inefficient. The simple sample of "a || b && c" could be easily been
+ * converted into:
+ * n0: r = a; if (r) goto T
+ * n1: r = b; if (!r) goto F
+ * n2: r = c; if (!r) goto F
+ * T: return TRUE;
+ * F: return FALSE;
+ *
+ * The First Pass is over the input string. The next too passes are over
+ * the program itself.
+ *
+ * ** SECOND PASS **
+ *
+ * Which brings us to the second pass. If a jump to a label has the
+ * same condition as that label, it can instead jump to its target.
+ * The original example of "a && !(!b || (c && g)) || d || e && !f"
+ * where the first pass gives us:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F:
+ * T: return TRUE;
+ * F: return FALSE
+ *
+ * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;".
+ * And "l5: if (r) goto T", we could optimize this by converting l3 and l4
+ * to go directly to T. To accomplish this, we start from the last
+ * entry in the program and work our way back. If the target of the entry
+ * has the same "when_to_branch" then we could use that entry's target.
+ * Doing this, the above would end up as:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d; l5: if (r) goto T;
+ * n6: r=e; l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F;
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * In that same pass, if the "when_to_branch" doesn't match, we can simply
+ * go to the program entry after the label. That is, "l2: if (!r) goto l4;"
+ * where "l4: if (r) goto T;", then we can convert l2 to be:
+ * "l2: if (!r) goto n5;".
+ *
+ * This will have the second pass give us:
+ * n1: r=a; l1: if (!r) goto n5;
+ * n2: r=b; l2: if (!r) goto n5;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Notice, all the "l#" labels are no longer used, and they can now
+ * be discarded.
+ *
+ * ** THIRD PASS **
+ *
+ * For the third pass we deal with the inverts. As they simply just
+ * make the "when_to_branch" get inverted, a simple loop over the
+ * program to that does: "when_to_branch ^= invert;" will do the
+ * job, leaving us with:
+ * n1: r=a; if (!r) goto n5;
+ * n2: r=b; if (!r) goto n5;
+ * n3: r=c: if (!r) goto T;
+ * n4: r=g; if (!r) goto T;
+ * n5: r=d; if (r) goto T
+ * n6: r=e; if (!r) goto F;
+ * n7: r=f; if (r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * As "r = a; if (!r) goto n5;" is obviously the same as
+ * "if (!a) goto n5;" without doing anything we can interperate the
+ * program as:
+ * n1: if (!a) goto n5;
+ * n2: if (!b) goto n5;
+ * n3: if (!c) goto T;
+ * n4: if (!g) goto T;
+ * n5: if (d) goto T
+ * n6: if (!e) goto F;
+ * n7: if (f) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Since the inverts are discarded at the end, there's no reason to store
+ * them in the program array (and waste memory). A separate array to hold
+ * the inverts is used and freed at the end.
+ */
+static struct prog_entry *
+predicate_parse(const char *str, int nr_parens, int nr_preds,
+ parse_pred_fn parse_pred, void *data,
+ struct filter_parse_error *pe)
+{
+ struct prog_entry *prog_stack;
+ struct prog_entry *prog;
+ const char *ptr = str;
+ char *inverts = NULL;
+ int *op_stack;
+ int *top;
+ int invert = 0;
+ int ret = -ENOMEM;
+ int len;
+ int N = 0;
+ int i;
+
+ nr_preds += 2; /* For TRUE and FALSE */
+
+ op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL);
+ if (!op_stack)
+ return ERR_PTR(-ENOMEM);
+ prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL);
+ if (!prog_stack) {
+ parse_error(pe, -ENOMEM, 0);
+ goto out_free;
+ }
+ inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL);
+ if (!inverts) {
+ parse_error(pe, -ENOMEM, 0);
+ goto out_free;
+ }
+
+ top = op_stack;
+ prog = prog_stack;
+ *top = 0;
+
+ /* First pass */
+ while (*ptr) { /* #1 */
+ const char *next = ptr++;
+
+ if (isspace(*next))
+ continue;
+
+ switch (*next) {
+ case '(': /* #2 */
+ if (top - op_stack > nr_parens)
+ return ERR_PTR(-EINVAL);
+ *(++top) = invert;
+ continue;
+ case '!': /* #3 */
+ if (!is_not(next))
+ break;
+ invert = !invert;
+ continue;
+ }
+
+ if (N >= nr_preds) {
+ parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str);
+ goto out_free;
+ }
+
+ inverts[N] = invert; /* #4 */
+ prog[N].target = N-1;
+
+ len = parse_pred(next, data, ptr - str, pe, &prog[N].pred);
+ if (len < 0) {
+ ret = len;
+ goto out_free;
+ }
+ ptr = next + len;
+
+ N++;
+
+ ret = -1;
+ while (1) { /* #5 */
+ next = ptr++;
+ if (isspace(*next))
+ continue;
+
+ switch (*next) {
+ case ')':
+ case '\0':
+ break;
+ case '&':
+ case '|':
+ if (next[1] == next[0]) {
+ ptr++;
+ break;
+ }
+ default:
+ parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
+ next - str);
+ goto out_free;
+ }
+
+ invert = *top & INVERT;
+
+ if (*top & PROCESS_AND) { /* #7 */
+ update_preds(prog, N - 1, invert);
+ *top &= ~PROCESS_AND;
+ }
+ if (*next == '&') { /* #8 */
+ *top |= PROCESS_AND;
+ break;
+ }
+ if (*top & PROCESS_OR) { /* #9 */
+ update_preds(prog, N - 1, !invert);
+ *top &= ~PROCESS_OR;
+ }
+ if (*next == '|') { /* #10 */
+ *top |= PROCESS_OR;
+ break;
+ }
+ if (!*next) /* #11 */
+ goto out;
+
+ if (top == op_stack) {
+ ret = -1;
+ /* Too few '(' */
+ parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str);
+ goto out_free;
+ }
+ top--; /* #12 */
+ }
+ }
+ out:
+ if (top != op_stack) {
+ /* Too many '(' */
+ parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str);
+ goto out_free;
+ }
+
+ prog[N].pred = NULL; /* #13 */
+ prog[N].target = 1; /* TRUE */
+ prog[N+1].pred = NULL;
+ prog[N+1].target = 0; /* FALSE */
+ prog[N-1].target = N;
+ prog[N-1].when_to_branch = false;
+
+ /* Second Pass */
+ for (i = N-1 ; i--; ) {
+ int target = prog[i].target;
+ if (prog[i].when_to_branch == prog[target].when_to_branch)
+ prog[i].target = prog[target].target;
+ }
+
+ /* Third Pass */
+ for (i = 0; i < N; i++) {
+ invert = inverts[i] ^ prog[i].when_to_branch;
+ prog[i].when_to_branch = invert;
+ /* Make sure the program always moves forward */
+ if (WARN_ON(prog[i].target <= i)) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ }
+
+ return prog;
+out_free:
+ kfree(op_stack);
+ kfree(prog_stack);
+ kfree(inverts);
+ return ERR_PTR(ret);
+}
+
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr < val); \
- return !!match == !pred->not; \
+ return *addr < val; \
} \
static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr <= val); \
- return !!match == !pred->not; \
+ return *addr <= val; \
} \
static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr > val); \
- return !!match == !pred->not; \
+ return *addr > val; \
} \
static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr >= val); \
- return !!match == !pred->not; \
+ return *addr >= val; \
} \
static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = !!(*addr & val); \
- return match == !pred->not; \
+ return !!(*addr & val); \
} \
static const filter_pred_fn_t pred_funcs_##type[] = { \
- filter_pred_LT_##type, \
filter_pred_LE_##type, \
- filter_pred_GT_##type, \
+ filter_pred_LT_##type, \
filter_pred_GE_##type, \
+ filter_pred_GT_##type, \
filter_pred_BAND_##type, \
};
-#define PRED_FUNC_START OP_LT
-
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
@@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
static int filter_pred_cpu(struct filter_pred *pred, void *event)
{
int cpu, cmp;
- int match = 0;
cpu = raw_smp_processor_id();
cmp = pred->val;
switch (pred->op) {
case OP_EQ:
- match = cpu == cmp;
- break;
+ return cpu == cmp;
+ case OP_NE:
+ return cpu != cmp;
case OP_LT:
- match = cpu < cmp;
- break;
+ return cpu < cmp;
case OP_LE:
- match = cpu <= cmp;
- break;
+ return cpu <= cmp;
case OP_GT:
- match = cpu > cmp;
- break;
+ return cpu > cmp;
case OP_GE:
- match = cpu >= cmp;
- break;
+ return cpu >= cmp;
default:
- break;
+ return 0;
}
-
- return !!match == !pred->not;
}
/* Filter predicate for COMM. */
static int filter_pred_comm(struct filter_pred *pred, void *event)
{
- int cmp, match;
+ int cmp;
cmp = pred->regex.match(current->comm, &pred->regex,
- pred->regex.field_len);
- match = cmp ^ pred->not;
-
- return match;
+ TASK_COMM_LEN);
+ return cmp ^ pred->not;
}
static int filter_pred_none(struct filter_pred *pred, void *event)
@@ -366,6 +790,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
return 1;
return 0;
}
+
/**
* filter_parse_regex - parse a basic regex
* @buff: the raw regex
@@ -426,10 +851,9 @@ static void filter_build_regex(struct filter_pred *pred)
struct regex *r = &pred->regex;
char *search;
enum regex_type type = MATCH_FULL;
- int not = 0;
if (pred->op == OP_GLOB) {
- type = filter_parse_regex(r->pattern, r->len, &search, &not);
+ type = filter_parse_regex(r->pattern, r->len, &search, &pred->not);
r->len = strlen(search);
memmove(r->pattern, search, r->len+1);
}
@@ -451,210 +875,32 @@ static void filter_build_regex(struct filter_pred *pred)
r->match = regex_match_glob;
break;
}
-
- pred->not ^= not;
-}
-
-enum move_type {
- MOVE_DOWN,
- MOVE_UP_FROM_LEFT,
- MOVE_UP_FROM_RIGHT
-};
-
-static struct filter_pred *
-get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
- int index, enum move_type *move)
-{
- if (pred->parent & FILTER_PRED_IS_RIGHT)
- *move = MOVE_UP_FROM_RIGHT;
- else
- *move = MOVE_UP_FROM_LEFT;
- pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
-
- return pred;
-}
-
-enum walk_return {
- WALK_PRED_ABORT,
- WALK_PRED_PARENT,
- WALK_PRED_DEFAULT,
-};
-
-typedef int (*filter_pred_walkcb_t) (enum move_type move,
- struct filter_pred *pred,
- int *err, void *data);
-
-static int walk_pred_tree(struct filter_pred *preds,
- struct filter_pred *root,
- filter_pred_walkcb_t cb, void *data)
-{
- struct filter_pred *pred = root;
- enum move_type move = MOVE_DOWN;
- int done = 0;
-
- if (!preds)
- return -EINVAL;
-
- do {
- int err = 0, ret;
-
- ret = cb(move, pred, &err, data);
- if (ret == WALK_PRED_ABORT)
- return err;
- if (ret == WALK_PRED_PARENT)
- goto get_parent;
-
- switch (move) {
- case MOVE_DOWN:
- if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
- goto get_parent;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- get_parent:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent,
- &move);
- continue;
- }
- done = 1;
- } while (!done);
-
- /* We are fine. */
- return 0;
-}
-
-/*
- * A series of AND or ORs where found together. Instead of
- * climbing up and down the tree branches, an array of the
- * ops were made in order of checks. We can just move across
- * the array and short circuit if needed.
- */
-static int process_ops(struct filter_pred *preds,
- struct filter_pred *op, void *rec)
-{
- struct filter_pred *pred;
- int match = 0;
- int type;
- int i;
-
- /*
- * Micro-optimization: We set type to true if op
- * is an OR and false otherwise (AND). Then we
- * just need to test if the match is equal to
- * the type, and if it is, we can short circuit the
- * rest of the checks:
- *
- * if ((match && op->op == OP_OR) ||
- * (!match && op->op == OP_AND))
- * return match;
- */
- type = op->op == OP_OR;
-
- for (i = 0; i < op->val; i++) {
- pred = &preds[op->ops[i]];
- if (!WARN_ON_ONCE(!pred->fn))
- match = pred->fn(pred, rec);
- if (!!match == type)
- break;
- }
- /* If not of not match is equal to not of not, then it is a match */
- return !!match == !op->not;
-}
-
-struct filter_match_preds_data {
- struct filter_pred *preds;
- int match;
- void *rec;
-};
-
-static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct filter_match_preds_data *d = data;
-
- *err = 0;
- switch (move) {
- case MOVE_DOWN:
- /* only AND and OR have children */
- if (pred->left != FILTER_PRED_INVALID) {
- /* If ops is set, then it was folded. */
- if (!pred->ops)
- return WALK_PRED_DEFAULT;
- /* We can treat folded ops as a leaf node */
- d->match = process_ops(d->preds, pred, d->rec);
- } else {
- if (!WARN_ON_ONCE(!pred->fn))
- d->match = pred->fn(pred, d->rec);
- }
-
- return WALK_PRED_PARENT;
- case MOVE_UP_FROM_LEFT:
- /*
- * Check for short circuits.
- *
- * Optimization: !!match == (pred->op == OP_OR)
- * is the same as:
- * if ((match && pred->op == OP_OR) ||
- * (!match && pred->op == OP_AND))
- */
- if (!!d->match == (pred->op == OP_OR))
- return WALK_PRED_PARENT;
- break;
- case MOVE_UP_FROM_RIGHT:
- break;
- }
-
- return WALK_PRED_DEFAULT;
}
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
- struct filter_pred *preds;
- struct filter_pred *root;
- struct filter_match_preds_data data = {
- /* match is currently meaningless */
- .match = -1,
- .rec = rec,
- };
- int n_preds, ret;
+ struct prog_entry *prog;
+ int i;
/* no filter is considered a match */
if (!filter)
return 1;
- n_preds = filter->n_preds;
- if (!n_preds)
+ prog = rcu_dereference_sched(filter->prog);
+ if (!prog)
return 1;
- /*
- * n_preds, root and filter->preds are protect with preemption disabled.
- */
- root = rcu_dereference_sched(filter->root);
- if (!root)
- return 1;
-
- data.preds = preds = rcu_dereference_sched(filter->preds);
- ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
- WARN_ON(ret);
- return data.match;
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
+ int match = pred->fn(pred, rec);
+ if (match == prog[i].when_to_branch)
+ i = prog[i].target;
+ }
+ return prog[i].target;
}
EXPORT_SYMBOL_GPL(filter_match_preds);
-static void parse_error(struct filter_parse_state *ps, int err, int pos)
-{
- ps->lasterr = err;
- ps->lasterr_pos = pos;
-}
-
static void remove_filter_string(struct event_filter *filter)
{
if (!filter)
@@ -664,57 +910,44 @@ static void remove_filter_string(struct event_filter *filter)
filter->filter_string = NULL;
}
-static int replace_filter_string(struct event_filter *filter,
- char *filter_string)
-{
- kfree(filter->filter_string);
- filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
- if (!filter->filter_string)
- return -ENOMEM;
-
- return 0;
-}
-
-static int append_filter_string(struct event_filter *filter,
- char *string)
-{
- int newlen;
- char *new_filter_string;
-
- BUG_ON(!filter->filter_string);
- newlen = strlen(filter->filter_string) + strlen(string) + 1;
- new_filter_string = kmalloc(newlen, GFP_KERNEL);
- if (!new_filter_string)
- return -ENOMEM;
-
- strcpy(new_filter_string, filter->filter_string);
- strcat(new_filter_string, string);
- kfree(filter->filter_string);
- filter->filter_string = new_filter_string;
-
- return 0;
-}
-
-static void append_filter_err(struct filter_parse_state *ps,
+static void append_filter_err(struct filter_parse_error *pe,
struct event_filter *filter)
{
- int pos = ps->lasterr_pos;
- char *buf, *pbuf;
+ struct trace_seq *s;
+ int pos = pe->lasterr_pos;
+ char *buf;
+ int len;
- buf = (char *)__get_free_page(GFP_KERNEL);
- if (!buf)
+ if (WARN_ON(!filter->filter_string))
return;
- append_filter_string(filter, "\n");
- memset(buf, ' ', PAGE_SIZE);
- if (pos > PAGE_SIZE - 128)
- pos = 0;
- buf[pos] = '^';
- pbuf = &buf[pos] + 1;
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return;
+ trace_seq_init(s);
+
+ len = strlen(filter->filter_string);
+ if (pos > len)
+ pos = len;
+
+ /* indexing is off by one */
+ if (pos)
+ pos++;
- sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
- append_filter_string(filter, buf);
- free_page((unsigned long) buf);
+ trace_seq_puts(s, filter->filter_string);
+ if (pe->lasterr > 0) {
+ trace_seq_printf(s, "\n%*s", pos, "^");
+ trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
+ } else {
+ trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
+ }
+ trace_seq_putc(s, 0);
+ buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
+ if (buf) {
+ kfree(filter->filter_string);
+ filter->filter_string = buf;
+ }
+ kfree(s);
}
static inline struct event_filter *event_filter(struct trace_event_file *file)
@@ -747,108 +980,18 @@ void print_subsystem_event_filter(struct event_subsystem *system,
mutex_unlock(&event_mutex);
}
-static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
-{
- stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
- if (!stack->preds)
- return -ENOMEM;
- stack->index = n_preds;
- return 0;
-}
-
-static void __free_pred_stack(struct pred_stack *stack)
-{
- kfree(stack->preds);
- stack->index = 0;
-}
-
-static int __push_pred_stack(struct pred_stack *stack,
- struct filter_pred *pred)
-{
- int index = stack->index;
-
- if (WARN_ON(index == 0))
- return -ENOSPC;
-
- stack->preds[--index] = pred;
- stack->index = index;
- return 0;
-}
-
-static struct filter_pred *
-__pop_pred_stack(struct pred_stack *stack)
-{
- struct filter_pred *pred;
- int index = stack->index;
-
- pred = stack->preds[index++];
- if (!pred)
- return NULL;
-
- stack->index = index;
- return pred;
-}
-
-static int filter_set_pred(struct event_filter *filter,
- int idx,
- struct pred_stack *stack,
- struct filter_pred *src)
-{
- struct filter_pred *dest = &filter->preds[idx];
- struct filter_pred *left;
- struct filter_pred *right;
-
- *dest = *src;
- dest->index = idx;
-
- if (dest->op == OP_OR || dest->op == OP_AND) {
- right = __pop_pred_stack(stack);
- left = __pop_pred_stack(stack);
- if (!left || !right)
- return -EINVAL;
- /*
- * If both children can be folded
- * and they are the same op as this op or a leaf,
- * then this op can be folded.
- */
- if (left->index & FILTER_PRED_FOLD &&
- ((left->op == dest->op && !left->not) ||
- left->left == FILTER_PRED_INVALID) &&
- right->index & FILTER_PRED_FOLD &&
- ((right->op == dest->op && !right->not) ||
- right->left == FILTER_PRED_INVALID))
- dest->index |= FILTER_PRED_FOLD;
-
- dest->left = left->index & ~FILTER_PRED_FOLD;
- dest->right = right->index & ~FILTER_PRED_FOLD;
- left->parent = dest->index & ~FILTER_PRED_FOLD;
- right->parent = dest->index | FILTER_PRED_IS_RIGHT;
- } else {
- /*
- * Make dest->left invalid to be used as a quick
- * way to know this is a leaf node.
- */
- dest->left = FILTER_PRED_INVALID;
-
- /* All leafs allow folding the parent ops. */
- dest->index |= FILTER_PRED_FOLD;
- }
-
- return __push_pred_stack(stack, dest);
-}
-
-static void __free_preds(struct event_filter *filter)
+static void free_prog(struct event_filter *filter)
{
+ struct prog_entry *prog;
int i;
- if (filter->preds) {
- for (i = 0; i < filter->n_preds; i++)
- kfree(filter->preds[i].ops);
- kfree(filter->preds);
- filter->preds = NULL;
- }
- filter->a_preds = 0;
- filter->n_preds = 0;
+ prog = rcu_access_pointer(filter->prog);
+ if (!prog)
+ return;
+
+ for (i = 0; prog[i].pred; i++)
+ kfree(prog[i].pred);
+ kfree(prog);
}
static void filter_disable(struct trace_event_file *file)
@@ -866,7 +1009,7 @@ static void __free_filter(struct event_filter *filter)
if (!filter)
return;
- __free_preds(filter);
+ free_prog(filter);
kfree(filter->filter_string);
kfree(filter);
}
@@ -876,38 +1019,6 @@ void free_event_filter(struct event_filter *filter)
__free_filter(filter);
}
-static struct event_filter *__alloc_filter(void)
-{
- struct event_filter *filter;
-
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- return filter;
-}
-
-static int __alloc_preds(struct event_filter *filter, int n_preds)
-{
- struct filter_pred *pred;
- int i;
-
- if (filter->preds)
- __free_preds(filter);
-
- filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
-
- if (!filter->preds)
- return -ENOMEM;
-
- filter->a_preds = n_preds;
- filter->n_preds = 0;
-
- for (i = 0; i < n_preds; i++) {
- pred = &filter->preds[i];
- pred->fn = filter_pred_none;
- }
-
- return 0;
-}
-
static inline void __remove_filter(struct trace_event_file *file)
{
filter_disable(file);
@@ -944,27 +1055,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
}
}
-static int filter_add_pred(struct filter_parse_state *ps,
- struct event_filter *filter,
- struct filter_pred *pred,
- struct pred_stack *stack)
-{
- int err;
-
- if (WARN_ON(filter->n_preds == filter->a_preds)) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
-
- err = filter_set_pred(filter, filter->n_preds, stack, pred);
- if (err)
- return err;
-
- filter->n_preds++;
-
- return 0;
-}
-
int filter_assign_type(const char *type)
{
if (strstr(type, "__data_loc") && strstr(type, "char"))
@@ -976,761 +1066,449 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
-{
- if (is_string_field(field) &&
- (op != OP_EQ && op != OP_NE && op != OP_GLOB))
- return false;
- if (!is_string_field(field) && op == OP_GLOB)
- return false;
-
- return true;
-}
-
static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
int field_size, int field_is_signed)
{
filter_pred_fn_t fn = NULL;
+ int pred_func_index = -1;
+
+ switch (op) {
+ case OP_EQ:
+ case OP_NE:
+ break;
+ default:
+ if (WARN_ON_ONCE(op < PRED_FUNC_START))
+ return NULL;
+ pred_func_index = op - PRED_FUNC_START;
+ if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
+ return NULL;
+ }
switch (field_size) {
case 8:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_64;
else if (field_is_signed)
- fn = pred_funcs_s64[op - PRED_FUNC_START];
+ fn = pred_funcs_s64[pred_func_index];
else
- fn = pred_funcs_u64[op - PRED_FUNC_START];
+ fn = pred_funcs_u64[pred_func_index];
break;
case 4:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_32;
else if (field_is_signed)
- fn = pred_funcs_s32[op - PRED_FUNC_START];
+ fn = pred_funcs_s32[pred_func_index];
else
- fn = pred_funcs_u32[op - PRED_FUNC_START];
+ fn = pred_funcs_u32[pred_func_index];
break;
case 2:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_16;
else if (field_is_signed)
- fn = pred_funcs_s16[op - PRED_FUNC_START];
+ fn = pred_funcs_s16[pred_func_index];
else
- fn = pred_funcs_u16[op - PRED_FUNC_START];
+ fn = pred_funcs_u16[pred_func_index];
break;
case 1:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_8;
else if (field_is_signed)
- fn = pred_funcs_s8[op - PRED_FUNC_START];
+ fn = pred_funcs_s8[pred_func_index];
else
- fn = pred_funcs_u8[op - PRED_FUNC_START];
+ fn = pred_funcs_u8[pred_func_index];
break;
}
return fn;
}
-static int init_pred(struct filter_parse_state *ps,
- struct ftrace_event_field *field,
- struct filter_pred *pred)
-
+/* Called when a predicate is encountered by predicate_parse() */
+static int parse_pred(const char *str, void *data,
+ int pos, struct filter_parse_error *pe,
+ struct filter_pred **pred_ptr)
{
- filter_pred_fn_t fn = filter_pred_none;
- unsigned long long val;
+ struct trace_event_call *call = data;
+ struct ftrace_event_field *field;
+ struct filter_pred *pred = NULL;
+ char num_buf[24]; /* Big enough to hold an address */
+ char *field_name;
+ char q;
+ u64 val;
+ int len;
int ret;
+ int op;
+ int s;
+ int i = 0;
- pred->offset = field->offset;
-
- if (!is_legal_op(field, pred->op)) {
- parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
- return -EINVAL;
- }
-
- if (field->filter_type == FILTER_COMM) {
- filter_build_regex(pred);
- fn = filter_pred_comm;
- pred->regex.field_len = TASK_COMM_LEN;
- } else if (is_string_field(field)) {
- filter_build_regex(pred);
-
- if (field->filter_type == FILTER_STATIC_STRING) {
- fn = filter_pred_string;
- pred->regex.field_len = field->size;
- } else if (field->filter_type == FILTER_DYN_STRING)
- fn = filter_pred_strloc;
- else
- fn = filter_pred_pchar;
- } else if (is_function_field(field)) {
- if (strcmp(field->name, "ip")) {
- parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
- return -EINVAL;
- }
- } else {
- if (field->is_signed)
- ret = kstrtoll(pred->regex.pattern, 0, &val);
- else
- ret = kstrtoull(pred->regex.pattern, 0, &val);
- if (ret) {
- parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
- return -EINVAL;
- }
- pred->val = val;
-
- if (field->filter_type == FILTER_CPU)
- fn = filter_pred_cpu;
- else
- fn = select_comparison_fn(pred->op, field->size,
- field->is_signed);
- if (!fn) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
- }
- }
-
- if (pred->op == OP_NE)
- pred->not ^= 1;
-
- pred->fn = fn;
- return 0;
-}
-
-static void parse_init(struct filter_parse_state *ps,
- struct filter_op *ops,
- char *infix_string)
-{
- memset(ps, '\0', sizeof(*ps));
-
- ps->infix.string = infix_string;
- ps->infix.cnt = strlen(infix_string);
- ps->ops = ops;
-
- INIT_LIST_HEAD(&ps->opstack);
- INIT_LIST_HEAD(&ps->postfix);
-}
-
-static char infix_next(struct filter_parse_state *ps)
-{
- if (!ps->infix.cnt)
- return 0;
-
- ps->infix.cnt--;
+ /* First find the field to associate to */
+ while (isspace(str[i]))
+ i++;
+ s = i;
- return ps->infix.string[ps->infix.tail++];
-}
-
-static char infix_peek(struct filter_parse_state *ps)
-{
- if (ps->infix.tail == strlen(ps->infix.string))
- return 0;
-
- return ps->infix.string[ps->infix.tail];
-}
+ while (isalnum(str[i]) || str[i] == '_')
+ i++;
-static void infix_advance(struct filter_parse_state *ps)
-{
- if (!ps->infix.cnt)
- return;
+ len = i - s;
- ps->infix.cnt--;
- ps->infix.tail++;
-}
+ if (!len)
+ return -1;
-static inline int is_precedence_lower(struct filter_parse_state *ps,
- int a, int b)
-{
- return ps->ops[a].precedence < ps->ops[b].precedence;
-}
+ field_name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!field_name)
+ return -ENOMEM;
-static inline int is_op_char(struct filter_parse_state *ps, char c)
-{
- int i;
+ /* Make sure that the field exists */
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (ps->ops[i].string[0] == c)
- return 1;
+ field = trace_find_event_field(call, field_name);
+ kfree(field_name);
+ if (!field) {
+ parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i);
+ return -EINVAL;
}
- return 0;
-}
-
-static int infix_get_op(struct filter_parse_state *ps, char firstc)
-{
- char nextc = infix_peek(ps);
- char opstr[3];
- int i;
-
- opstr[0] = firstc;
- opstr[1] = nextc;
- opstr[2] = '\0';
+ while (isspace(str[i]))
+ i++;
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (!strcmp(opstr, ps->ops[i].string)) {
- infix_advance(ps);
- return ps->ops[i].id;
- }
+ /* Make sure this op is supported */
+ for (op = 0; ops[op]; op++) {
+ /* This is why '<=' must come before '<' in ops[] */
+ if (strncmp(str + i, ops[op], strlen(ops[op])) == 0)
+ break;
}
- opstr[1] = '\0';
-
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (!strcmp(opstr, ps->ops[i].string))
- return ps->ops[i].id;
+ if (!ops[op]) {
+ parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+ goto err_free;
}
- return OP_NONE;
-}
-
-static inline void clear_operand_string(struct filter_parse_state *ps)
-{
- memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
- ps->operand.tail = 0;
-}
-
-static inline int append_operand_char(struct filter_parse_state *ps, char c)
-{
- if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
- return -EINVAL;
+ i += strlen(ops[op]);
- ps->operand.string[ps->operand.tail++] = c;
+ while (isspace(str[i]))
+ i++;
- return 0;
-}
+ s = i;
-static int filter_opstack_push(struct filter_parse_state *ps,
- enum filter_op_ids op)
-{
- struct opstack_op *opstack_op;
-
- opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
- if (!opstack_op)
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
return -ENOMEM;
- opstack_op->op = op;
- list_add(&opstack_op->list, &ps->opstack);
-
- return 0;
-}
+ pred->field = field;
+ pred->offset = field->offset;
+ pred->op = op;
-static int filter_opstack_empty(struct filter_parse_state *ps)
-{
- return list_empty(&ps->opstack);
-}
+ if (ftrace_event_is_function(call)) {
+ /*
+ * Perf does things different with function events.
+ * It only allows an "ip" field, and expects a string.
+ * But the string does not need to be surrounded by quotes.
+ * If it is a string, the assigned function as a nop,
+ * (perf doesn't use it) and grab everything.
+ */
+ if (strcmp(field->name, "ip") != 0) {
+ parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
+ goto err_free;
+ }
+ pred->fn = filter_pred_none;
+
+ /*
+ * Quotes are not required, but if they exist then we need
+ * to read them till we hit a matching one.
+ */
+ if (str[i] == '\'' || str[i] == '"')
+ q = str[i];
+ else
+ q = 0;
+
+ for (i++; str[i]; i++) {
+ if (q && str[i] == q)
+ break;
+ if (!q && (str[i] == ')' || str[i] == '&' ||
+ str[i] == '|'))
+ break;
+ }
+ /* Skip quotes */
+ if (q)
+ s++;
+ len = i - s;
+ if (len >= MAX_FILTER_STR_VAL) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
-static int filter_opstack_top(struct filter_parse_state *ps)
-{
- struct opstack_op *opstack_op;
+ pred->regex.len = len;
+ strncpy(pred->regex.pattern, str + s, len);
+ pred->regex.pattern[len] = 0;
+
+ /* This is either a string, or an integer */
+ } else if (str[i] == '\'' || str[i] == '"') {
+ char q = str[i];
+
+ /* Make sure the op is OK for strings */
+ switch (op) {
+ case OP_NE:
+ pred->not = 1;
+ /* Fall through */
+ case OP_GLOB:
+ case OP_EQ:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
- if (filter_opstack_empty(ps))
- return OP_NONE;
+ /* Make sure the field is OK for strings */
+ if (!is_string_field(field)) {
+ parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i);
+ goto err_free;
+ }
- opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+ for (i++; str[i]; i++) {
+ if (str[i] == q)
+ break;
+ }
+ if (!str[i]) {
+ parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i);
+ goto err_free;
+ }
- return opstack_op->op;
-}
+ /* Skip quotes */
+ s++;
+ len = i - s;
+ if (len >= MAX_FILTER_STR_VAL) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
-static int filter_opstack_pop(struct filter_parse_state *ps)
-{
- struct opstack_op *opstack_op;
- enum filter_op_ids op;
+ pred->regex.len = len;
+ strncpy(pred->regex.pattern, str + s, len);
+ pred->regex.pattern[len] = 0;
- if (filter_opstack_empty(ps))
- return OP_NONE;
+ filter_build_regex(pred);
- opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
- op = opstack_op->op;
- list_del(&opstack_op->list);
+ if (field->filter_type == FILTER_COMM) {
+ pred->fn = filter_pred_comm;
- kfree(opstack_op);
+ } else if (field->filter_type == FILTER_STATIC_STRING) {
+ pred->fn = filter_pred_string;
+ pred->regex.field_len = field->size;
- return op;
-}
+ } else if (field->filter_type == FILTER_DYN_STRING)
+ pred->fn = filter_pred_strloc;
+ else
+ pred->fn = filter_pred_pchar;
+ /* go past the last quote */
+ i++;
-static void filter_opstack_clear(struct filter_parse_state *ps)
-{
- while (!filter_opstack_empty(ps))
- filter_opstack_pop(ps);
-}
+ } else if (isdigit(str[i])) {
-static char *curr_operand(struct filter_parse_state *ps)
-{
- return ps->operand.string;
-}
+ /* Make sure the field is not a string */
+ if (is_string_field(field)) {
+ parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i);
+ goto err_free;
+ }
-static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
-{
- struct postfix_elt *elt;
+ if (op == OP_GLOB) {
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
- elt = kmalloc(sizeof(*elt), GFP_KERNEL);
- if (!elt)
- return -ENOMEM;
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
- elt->op = OP_NONE;
- elt->operand = kstrdup(operand, GFP_KERNEL);
- if (!elt->operand) {
- kfree(elt);
- return -ENOMEM;
- }
+ len = i - s;
+ /* 0xfeedfacedeadbeef is 18 chars max */
+ if (len >= sizeof(num_buf)) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
- list_add_tail(&elt->list, &ps->postfix);
+ strncpy(num_buf, str + s, len);
+ num_buf[len] = 0;
- return 0;
-}
+ /* Make sure it is a value */
+ if (field->is_signed)
+ ret = kstrtoll(num_buf, 0, &val);
+ else
+ ret = kstrtoull(num_buf, 0, &val);
+ if (ret) {
+ parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s);
+ goto err_free;
+ }
-static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op)
-{
- struct postfix_elt *elt;
+ pred->val = val;
- elt = kmalloc(sizeof(*elt), GFP_KERNEL);
- if (!elt)
- return -ENOMEM;
+ if (field->filter_type == FILTER_CPU)
+ pred->fn = filter_pred_cpu;
+ else {
+ pred->fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ }
- elt->op = op;
- elt->operand = NULL;
+ } else {
+ parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+ goto err_free;
+ }
- list_add_tail(&elt->list, &ps->postfix);
+ *pred_ptr = pred;
+ return i;
- return 0;
+err_free:
+ kfree(pred);
+ return -EINVAL;
}
-static void postfix_clear(struct filter_parse_state *ps)
-{
- struct postfix_elt *elt;
+enum {
+ TOO_MANY_CLOSE = -1,
+ TOO_MANY_OPEN = -2,
+ MISSING_QUOTE = -3,
+};
- while (!list_empty(&ps->postfix)) {
- elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
- list_del(&elt->list);
- kfree(elt->operand);
- kfree(elt);
- }
-}
+/*
+ * Read the filter string once to calculate the number of predicates
+ * as well as how deep the parentheses go.
+ *
+ * Returns:
+ * 0 - everything is fine (err is undefined)
+ * -1 - too many ')'
+ * -2 - too many '('
+ * -3 - No matching quote
+ */
+static int calc_stack(const char *str, int *parens, int *preds, int *err)
+{
+ bool is_pred = false;
+ int nr_preds = 0;
+ int open = 1; /* Count the expression as "(E)" */
+ int last_quote = 0;
+ int max_open = 1;
+ int quote = 0;
+ int i;
-static int filter_parse(struct filter_parse_state *ps)
-{
- enum filter_op_ids op, top_op;
- int in_string = 0;
- char ch;
+ *err = 0;
- while ((ch = infix_next(ps))) {
- if (ch == '"') {
- in_string ^= 1;
+ for (i = 0; str[i]; i++) {
+ if (isspace(str[i]))
continue;
- }
-
- if (in_string)
- goto parse_operand;
-
- if (isspace(ch))
+ if (quote) {
+ if (str[i] == quote)
+ quote = 0;
continue;
+ }
- if (is_op_char(ps, ch)) {
- op = infix_get_op(ps, ch);
- if (op == OP_NONE) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
- }
-
- if (strlen(curr_operand(ps))) {
- postfix_append_operand(ps, curr_operand(ps));
- clear_operand_string(ps);
- }
-
- while (!filter_opstack_empty(ps)) {
- top_op = filter_opstack_top(ps);
- if (!is_precedence_lower(ps, top_op, op)) {
- top_op = filter_opstack_pop(ps);
- postfix_append_op(ps, top_op);
- continue;
- }
+ switch (str[i]) {
+ case '\'':
+ case '"':
+ quote = str[i];
+ last_quote = i;
+ break;
+ case '|':
+ case '&':
+ if (str[i+1] != str[i])
break;
- }
-
- filter_opstack_push(ps, op);
+ is_pred = false;
continue;
- }
-
- if (ch == '(') {
- filter_opstack_push(ps, OP_OPEN_PAREN);
+ case '(':
+ is_pred = false;
+ open++;
+ if (open > max_open)
+ max_open = open;
continue;
- }
-
- if (ch == ')') {
- if (strlen(curr_operand(ps))) {
- postfix_append_operand(ps, curr_operand(ps));
- clear_operand_string(ps);
- }
-
- top_op = filter_opstack_pop(ps);
- while (top_op != OP_NONE) {
- if (top_op == OP_OPEN_PAREN)
- break;
- postfix_append_op(ps, top_op);
- top_op = filter_opstack_pop(ps);
- }
- if (top_op == OP_NONE) {
- parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
- return -EINVAL;
+ case ')':
+ is_pred = false;
+ if (open == 1) {
+ *err = i;
+ return TOO_MANY_CLOSE;
}
+ open--;
continue;
}
-parse_operand:
- if (append_operand_char(ps, ch)) {
- parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
- return -EINVAL;
- }
- }
-
- if (strlen(curr_operand(ps)))
- postfix_append_operand(ps, curr_operand(ps));
-
- while (!filter_opstack_empty(ps)) {
- top_op = filter_opstack_pop(ps);
- if (top_op == OP_NONE)
- break;
- if (top_op == OP_OPEN_PAREN) {
- parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
- return -EINVAL;
+ if (!is_pred) {
+ nr_preds++;
+ is_pred = true;
}
- postfix_append_op(ps, top_op);
}
- return 0;
-}
-
-static struct filter_pred *create_pred(struct filter_parse_state *ps,
- struct trace_event_call *call,
- enum filter_op_ids op,
- char *operand1, char *operand2)
-{
- struct ftrace_event_field *field;
- static struct filter_pred pred;
-
- memset(&pred, 0, sizeof(pred));
- pred.op = op;
-
- if (op == OP_AND || op == OP_OR)
- return &pred;
-
- if (!operand1 || !operand2) {
- parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
- return NULL;
- }
-
- field = trace_find_event_field(call, operand1);
- if (!field) {
- parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
- return NULL;
+ if (quote) {
+ *err = last_quote;
+ return MISSING_QUOTE;
}
- strcpy(pred.regex.pattern, operand2);
- pred.regex.len = strlen(pred.regex.pattern);
- pred.field = field;
- return init_pred(ps, field, &pred) ? NULL : &pred;
-}
-
-static int check_preds(struct filter_parse_state *ps)
-{
- int n_normal_preds = 0, n_logical_preds = 0;
- struct postfix_elt *elt;
- int cnt = 0;
+ if (open != 1) {
+ int level = open;
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE) {
- cnt++;
- continue;
- }
-
- if (elt->op == OP_AND || elt->op == OP_OR) {
- n_logical_preds++;
- cnt--;
- continue;
+ /* find the bad open */
+ for (i--; i; i--) {
+ if (quote) {
+ if (str[i] == quote)
+ quote = 0;
+ continue;
+ }
+ switch (str[i]) {
+ case '(':
+ if (level == open) {
+ *err = i;
+ return TOO_MANY_OPEN;
+ }
+ level--;
+ break;
+ case ')':
+ level++;
+ break;
+ case '\'':
+ case '"':
+ quote = str[i];
+ break;
+ }
}
- if (elt->op != OP_NOT)
- cnt--;
- n_normal_preds++;
- /* all ops should have operands */
- if (cnt < 0)
- break;
- }
-
- if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
- parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
- return -EINVAL;
+ /* First character is the '(' with missing ')' */
+ *err = 0;
+ return TOO_MANY_OPEN;
}
+ /* Set the size of the required stacks */
+ *parens = max_open;
+ *preds = nr_preds;
return 0;
}
-static int count_preds(struct filter_parse_state *ps)
-{
- struct postfix_elt *elt;
- int n_preds = 0;
-
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE)
- continue;
- n_preds++;
- }
-
- return n_preds;
-}
-
-struct check_pred_data {
- int count;
- int max;
-};
-
-static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct check_pred_data *d = data;
-
- if (WARN_ON(d->count++ > d->max)) {
- *err = -EINVAL;
- return WALK_PRED_ABORT;
- }
- return WALK_PRED_DEFAULT;
-}
-
-/*
- * The tree is walked at filtering of an event. If the tree is not correctly
- * built, it may cause an infinite loop. Check here that the tree does
- * indeed terminate.
- */
-static int check_pred_tree(struct event_filter *filter,
- struct filter_pred *root)
-{
- struct check_pred_data data = {
- /*
- * The max that we can hit a node is three times.
- * Once going down, once coming up from left, and
- * once coming up from right. This is more than enough
- * since leafs are only hit a single time.
- */
- .max = 3 * filter->n_preds,
- .count = 0,
- };
-
- return walk_pred_tree(filter->preds, root,
- check_pred_tree_cb, &data);
-}
-
-static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- int *count = data;
-
- if ((move == MOVE_DOWN) &&
- (pred->left == FILTER_PRED_INVALID))
- (*count)++;
-
- return WALK_PRED_DEFAULT;
-}
-
-static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
-{
- int count = 0, ret;
-
- ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
- WARN_ON(ret);
- return count;
-}
-
-struct fold_pred_data {
- struct filter_pred *root;
- int count;
- int children;
-};
-
-static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct fold_pred_data *d = data;
- struct filter_pred *root = d->root;
-
- if (move != MOVE_DOWN)
- return WALK_PRED_DEFAULT;
- if (pred->left != FILTER_PRED_INVALID)
- return WALK_PRED_DEFAULT;
-
- if (WARN_ON(d->count == d->children)) {
- *err = -EINVAL;
- return WALK_PRED_ABORT;
- }
-
- pred->index &= ~FILTER_PRED_FOLD;
- root->ops[d->count++] = pred->index;
- return WALK_PRED_DEFAULT;
-}
-
-static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
-{
- struct fold_pred_data data = {
- .root = root,
- .count = 0,
- };
- int children;
-
- /* No need to keep the fold flag */
- root->index &= ~FILTER_PRED_FOLD;
-
- /* If the root is a leaf then do nothing */
- if (root->left == FILTER_PRED_INVALID)
- return 0;
-
- /* count the children */
- children = count_leafs(preds, &preds[root->left]);
- children += count_leafs(preds, &preds[root->right]);
-
- root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
- if (!root->ops)
- return -ENOMEM;
-
- root->val = children;
- data.children = children;
- return walk_pred_tree(preds, root, fold_pred_cb, &data);
-}
-
-static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct filter_pred *preds = data;
-
- if (move != MOVE_DOWN)
- return WALK_PRED_DEFAULT;
- if (!(pred->index & FILTER_PRED_FOLD))
- return WALK_PRED_DEFAULT;
-
- *err = fold_pred(preds, pred);
- if (*err)
- return WALK_PRED_ABORT;
-
- /* eveyrhing below is folded, continue with parent */
- return WALK_PRED_PARENT;
-}
-
-/*
- * To optimize the processing of the ops, if we have several "ors" or
- * "ands" together, we can put them in an array and process them all
- * together speeding up the filter logic.
- */
-static int fold_pred_tree(struct event_filter *filter,
- struct filter_pred *root)
-{
- return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
- filter->preds);
-}
-
-static int replace_preds(struct trace_event_call *call,
+static int process_preds(struct trace_event_call *call,
+ const char *filter_string,
struct event_filter *filter,
- struct filter_parse_state *ps,
- bool dry_run)
+ struct filter_parse_error *pe)
{
- char *operand1 = NULL, *operand2 = NULL;
- struct filter_pred *pred;
- struct filter_pred *root;
- struct postfix_elt *elt;
- struct pred_stack stack = { }; /* init to NULL */
- int err;
- int n_preds = 0;
-
- n_preds = count_preds(ps);
- if (n_preds >= MAX_FILTER_PRED) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
-
- err = check_preds(ps);
- if (err)
- return err;
-
- if (!dry_run) {
- err = __alloc_pred_stack(&stack, n_preds);
- if (err)
- return err;
- err = __alloc_preds(filter, n_preds);
- if (err)
- goto fail;
- }
-
- n_preds = 0;
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE) {
- if (!operand1)
- operand1 = elt->operand;
- else if (!operand2)
- operand2 = elt->operand;
- else {
- parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
- err = -EINVAL;
- goto fail;
- }
- continue;
- }
-
- if (elt->op == OP_NOT) {
- if (!n_preds || operand1 || operand2) {
- parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
- err = -EINVAL;
- goto fail;
- }
- if (!dry_run)
- filter->preds[n_preds - 1].not ^= 1;
- continue;
- }
-
- if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- err = -ENOSPC;
- goto fail;
- }
-
- pred = create_pred(ps, call, elt->op, operand1, operand2);
- if (!pred) {
- err = -EINVAL;
- goto fail;
- }
+ struct prog_entry *prog;
+ int nr_parens;
+ int nr_preds;
+ int index;
+ int ret;
- if (!dry_run) {
- err = filter_add_pred(ps, filter, pred, &stack);
- if (err)
- goto fail;
+ ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index);
+ if (ret < 0) {
+ switch (ret) {
+ case MISSING_QUOTE:
+ parse_error(pe, FILT_ERR_MISSING_QUOTE, index);
+ break;
+ case TOO_MANY_OPEN:
+ parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index);
+ break;
+ default:
+ parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index);
}
-
- operand1 = operand2 = NULL;
+ return ret;
}
- if (!dry_run) {
- /* We should have one item left on the stack */
- pred = __pop_pred_stack(&stack);
- if (!pred)
- return -EINVAL;
- /* This item is where we start from in matching */
- root = pred;
- /* Make sure the stack is empty */
- pred = __pop_pred_stack(&stack);
- if (WARN_ON(pred)) {
- err = -EINVAL;
- filter->root = NULL;
- goto fail;
- }
- err = check_pred_tree(filter, root);
- if (err)
- goto fail;
-
- /* Optimize the tree */
- err = fold_pred_tree(filter, root);
- if (err)
- goto fail;
-
- /* We don't set root until we know it works */
- barrier();
- filter->root = root;
+ if (!nr_preds) {
+ prog = NULL;
+ } else {
+ prog = predicate_parse(filter_string, nr_parens, nr_preds,
+ parse_pred, call, pe);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
}
-
- err = 0;
-fail:
- __free_pred_stack(&stack);
- return err;
+ rcu_assign_pointer(filter->prog, prog);
+ return 0;
}
static inline void event_set_filtered_flag(struct trace_event_file *file)
@@ -1780,72 +1558,53 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct trace_subsystem_dir *dir,
+static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
- struct filter_parse_state *ps,
+ struct filter_parse_error *pe,
char *filter_string)
{
struct trace_event_file *file;
struct filter_list *filter_item;
+ struct event_filter *filter = NULL;
struct filter_list *tmp;
LIST_HEAD(filter_list);
bool fail = true;
int err;
list_for_each_entry(file, &tr->events, list) {
- if (file->system != dir)
- continue;
-
- /*
- * Try to see if the filter can be applied
- * (filter arg is ignored on dry_run)
- */
- err = replace_preds(file->event_call, NULL, ps, true);
- if (err)
- event_set_no_set_filter_flag(file);
- else
- event_clear_no_set_filter_flag(file);
- }
-
- list_for_each_entry(file, &tr->events, list) {
- struct event_filter *filter;
if (file->system != dir)
continue;
- if (event_no_set_filter_flag(file))
- continue;
-
- filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
- if (!filter_item)
- goto fail_mem;
-
- list_add_tail(&filter_item->list, &filter_list);
-
- filter_item->filter = __alloc_filter();
- if (!filter_item->filter)
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
goto fail_mem;
- filter = filter_item->filter;
- /* Can only fail on no memory */
- err = replace_filter_string(filter, filter_string);
- if (err)
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
goto fail_mem;
- err = replace_preds(file->event_call, filter, ps, false);
+ err = process_preds(file->event_call, filter_string, filter, pe);
if (err) {
filter_disable(file);
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- append_filter_err(ps, filter);
+ parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ append_filter_err(pe, filter);
} else
event_set_filtered_flag(file);
+
+
+ filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ if (!filter_item)
+ goto fail_mem;
+
+ list_add_tail(&filter_item->list, &filter_list);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
*/
- filter = event_filter(file);
- event_set_filter(file, filter_item->filter);
- filter_item->filter = filter;
+ filter_item->filter = event_filter(file);
+ event_set_filter(file, filter);
+ filter = NULL;
fail = false;
}
@@ -1871,9 +1630,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
list_del(&filter_item->list);
kfree(filter_item);
}
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
+ kfree(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
synchronize_sched();
@@ -1885,47 +1645,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
return -ENOMEM;
}
-static int create_filter_start(char *filter_str, bool set_str,
- struct filter_parse_state **psp,
+static int create_filter_start(char *filter_string, bool set_str,
+ struct filter_parse_error **pse,
struct event_filter **filterp)
{
struct event_filter *filter;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err = 0;
- WARN_ON_ONCE(*psp || *filterp);
+ if (WARN_ON_ONCE(*pse || *filterp))
+ return -EINVAL;
- /* allocate everything, and if any fails, free all and fail */
- filter = __alloc_filter();
- if (filter && set_str)
- err = replace_filter_string(filter, filter_str);
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (filter && set_str) {
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
+ err = -ENOMEM;
+ }
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ pe = kzalloc(sizeof(*pe), GFP_KERNEL);
- if (!filter || !ps || err) {
- kfree(ps);
+ if (!filter || !pe || err) {
+ kfree(pe);
__free_filter(filter);
return -ENOMEM;
}
/* we're committed to creating a new filter */
*filterp = filter;
- *psp = ps;
+ *pse = pe;
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err && set_str)
- append_filter_err(ps, filter);
- return err;
+ return 0;
}
-static void create_filter_finish(struct filter_parse_state *ps)
+static void create_filter_finish(struct filter_parse_error *pe)
{
- if (ps) {
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
- }
+ kfree(pe);
}
/**
@@ -1945,26 +1700,20 @@ static void create_filter_finish(struct filter_parse_state *ps)
* freeing it.
*/
static int create_filter(struct trace_event_call *call,
- char *filter_str, bool set_str,
+ char *filter_string, bool set_str,
struct event_filter **filterp)
{
- struct event_filter *filter = NULL;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err;
- err = create_filter_start(filter_str, set_str, &ps, &filter);
- if (!err) {
- err = replace_preds(call, filter, ps, false);
- if (err && set_str)
- append_filter_err(ps, filter);
- }
- if (err && !set_str) {
- free_event_filter(filter);
- filter = NULL;
- }
- create_filter_finish(ps);
+ err = create_filter_start(filter_string, set_str, &pe, filterp);
+ if (err)
+ return err;
+
+ err = process_preds(call, filter_string, *filterp, pe);
+ if (err && set_str)
+ append_filter_err(pe, *filterp);
- *filterp = filter;
return err;
}
@@ -1988,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
- struct event_filter *filter = NULL;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err;
- err = create_filter_start(filter_str, true, &ps, &filter);
+ err = create_filter_start(filter_str, true, &pe, filterp);
if (!err) {
- err = replace_system_preds(dir, tr, ps, filter_str);
+ err = process_system_preds(dir, tr, pe, filter_str);
if (!err) {
/* System filters just show a default message */
- kfree(filter->filter_string);
- filter->filter_string = NULL;
+ kfree((*filterp)->filter_string);
+ (*filterp)->filter_string = NULL;
} else {
- append_filter_err(ps, filter);
+ append_filter_err(pe, *filterp);
}
}
- create_filter_finish(ps);
+ create_filter_finish(pe);
- *filterp = filter;
return err;
}
@@ -2013,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
int apply_event_filter(struct trace_event_file *file, char *filter_string)
{
struct trace_event_call *call = file->event_call;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
int err;
if (!strcmp(strstrip(filter_string), "0")) {
@@ -2066,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
{
struct event_subsystem *system = dir->subsystem;
struct trace_array *tr = dir->tr;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
int err = 0;
mutex_lock(&event_mutex);
@@ -2186,66 +1933,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
return ret;
}
-static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
+static int ftrace_function_check_pred(struct filter_pred *pred)
{
struct ftrace_event_field *field = pred->field;
- if (leaf) {
- /*
- * Check the leaf predicate for function trace, verify:
- * - only '==' and '!=' is used
- * - the 'ip' field is used
- */
- if ((pred->op != OP_EQ) && (pred->op != OP_NE))
- return -EINVAL;
+ /*
+ * Check the predicate for function trace, verify:
+ * - only '==' and '!=' is used
+ * - the 'ip' field is used
+ */
+ if ((pred->op != OP_EQ) && (pred->op != OP_NE))
+ return -EINVAL;
- if (strcmp(field->name, "ip"))
- return -EINVAL;
- } else {
- /*
- * Check the non leaf predicate for function trace, verify:
- * - only '||' is used
- */
- if (pred->op != OP_OR)
- return -EINVAL;
- }
+ if (strcmp(field->name, "ip"))
+ return -EINVAL;
return 0;
}
-static int ftrace_function_set_filter_cb(enum move_type move,
- struct filter_pred *pred,
- int *err, void *data)
+static int ftrace_function_set_filter_pred(struct filter_pred *pred,
+ struct function_filter_data *data)
{
+ int ret;
+
/* Checking the node is valid for function trace. */
- if ((move != MOVE_DOWN) ||
- (pred->left != FILTER_PRED_INVALID)) {
- *err = ftrace_function_check_pred(pred, 0);
- } else {
- *err = ftrace_function_check_pred(pred, 1);
- if (*err)
- return WALK_PRED_ABORT;
-
- *err = __ftrace_function_set_filter(pred->op == OP_EQ,
- pred->regex.pattern,
- pred->regex.len,
- data);
- }
+ ret = ftrace_function_check_pred(pred);
+ if (ret)
+ return ret;
- return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
+ return __ftrace_function_set_filter(pred->op == OP_EQ,
+ pred->regex.pattern,
+ pred->regex.len,
+ data);
+}
+
+static bool is_or(struct prog_entry *prog, int i)
+{
+ int target;
+
+ /*
+ * Only "||" is allowed for function events, thus,
+ * all true branches should jump to true, and any
+ * false branch should jump to false.
+ */
+ target = prog[i].target + 1;
+ /* True and false have NULL preds (all prog entries should jump to one */
+ if (prog[target].pred)
+ return false;
+
+ /* prog[target].target is 1 for TRUE, 0 for FALSE */
+ return prog[i].when_to_branch == prog[target].target;
}
static int ftrace_function_set_filter(struct perf_event *event,
struct event_filter *filter)
{
+ struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+ lockdep_is_held(&event_mutex));
struct function_filter_data data = {
.first_filter = 1,
.first_notrace = 1,
.ops = &event->ftrace_ops,
};
+ int i;
+
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
- return walk_pred_tree(filter->preds, filter->root,
- ftrace_function_set_filter_cb, &data);
+ if (!is_or(prog, i))
+ return -EINVAL;
+
+ if (ftrace_function_set_filter_pred(pred, &data) < 0)
+ return -EINVAL;
+ }
+ return 0;
}
#else
static int ftrace_function_set_filter(struct perf_event *event,
@@ -2259,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str)
{
int err;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
struct trace_event_call *call;
mutex_lock(&event_mutex);
@@ -2375,7 +2136,7 @@ static struct test_filter_data_t {
#undef YES
#undef NO
-#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+#define DATA_CNT ARRAY_SIZE(test_filter_data)
static int test_pred_visited;
@@ -2388,26 +2149,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event)
return 1;
}
-static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
+static void update_pred_fn(struct event_filter *filter, char *fields)
{
- char *fields = data;
+ struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+ lockdep_is_held(&event_mutex));
+ int i;
- if ((move == MOVE_DOWN) &&
- (pred->left == FILTER_PRED_INVALID)) {
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
+ WARN_ON_ONCE(!pred->fn);
+
if (!field) {
- WARN(1, "all leafs should have field defined");
- return WALK_PRED_DEFAULT;
+ WARN_ONCE(1, "all leafs should have field defined %d", i);
+ continue;
}
+
if (!strchr(fields, *field->name))
- return WALK_PRED_DEFAULT;
+ continue;
- WARN_ON(!pred->fn);
pred->fn = test_pred_visited_fn;
}
- return WALK_PRED_DEFAULT;
}
static __init int ftrace_test_event_filter(void)
@@ -2431,20 +2194,22 @@ static __init int ftrace_test_event_filter(void)
break;
}
+ /* Needed to dereference filter->prog */
+ mutex_lock(&event_mutex);
/*
* The preemption disabling is not really needed for self
* tests, but the rcu dereference will complain without it.
*/
preempt_disable();
if (*d->not_visited)
- walk_pred_tree(filter->preds, filter->root,
- test_walk_pred_cb,
- d->not_visited);
+ update_pred_fn(filter, d->not_visited);
test_pred_visited = 0;
err = filter_match_preds(filter, &d->rec);
preempt_enable();
+ mutex_unlock(&event_mutex);
+
__free_filter(filter);
if (test_pred_visited) {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1e1558c99d56..0d7b3ffbecc2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -20,15 +20,39 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/rculist.h>
+#include <linux/tracefs.h>
#include "tracing_map.h"
#include "trace.h"
+#define SYNTH_SYSTEM "synthetic"
+#define SYNTH_FIELDS_MAX 16
+
+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+
struct hist_field;
-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event);
#define HIST_FIELD_OPERANDS_MAX 2
+#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
+#define HIST_ACTIONS_MAX 8
+
+enum field_op_id {
+ FIELD_OP_NONE,
+ FIELD_OP_PLUS,
+ FIELD_OP_MINUS,
+ FIELD_OP_UNARY_MINUS,
+};
+
+struct hist_var {
+ char *name;
+ struct hist_trigger_data *hist_data;
+ unsigned int idx;
+};
struct hist_field {
struct ftrace_event_field *field;
@@ -37,27 +61,49 @@ struct hist_field {
unsigned int size;
unsigned int offset;
unsigned int is_signed;
+ const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
+ struct hist_trigger_data *hist_data;
+ struct hist_var var;
+ enum field_op_id operator;
+ char *system;
+ char *event_name;
+ char *name;
+ unsigned int var_idx;
+ unsigned int var_ref_idx;
+ bool read_once;
};
-static u64 hist_field_none(struct hist_field *field, void *event)
+static u64 hist_field_none(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
return 0;
}
-static u64 hist_field_counter(struct hist_field *field, void *event)
+static u64 hist_field_counter(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
return 1;
}
-static u64 hist_field_string(struct hist_field *hist_field, void *event)
+static u64 hist_field_string(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
char *addr = (char *)(event + hist_field->field->offset);
return (u64)(unsigned long)addr;
}
-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_dynstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
u32 str_item = *(u32 *)(event + hist_field->field->offset);
int str_loc = str_item & 0xffff;
@@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
return (u64)(unsigned long)addr;
}
-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_pstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
char **addr = (char **)(event + hist_field->field->offset);
return (u64)(unsigned long)*addr;
}
-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+static u64 hist_field_log2(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, event);
+ u64 val = operand->fn(operand, elt, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_plus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+ return val1 + val2;
+}
+
+static u64 hist_field_minus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+ return val1 - val2;
+}
+
+static u64 hist_field_unary_minus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+
+ s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+ u64 val = (u64)-sval;
+
+ return val;
+}
+
#define DEFINE_HIST_FIELD_FN(type) \
-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+ static u64 hist_field_##type(struct hist_field *hist_field, \
+ struct tracing_map_elt *elt, \
+ struct ring_buffer_event *rbe, \
+ void *event) \
{ \
type *addr = (type *)(event + hist_field->field->offset); \
\
@@ -126,6 +222,19 @@ enum hist_field_flags {
HIST_FIELD_FL_SYSCALL = 1 << 7,
HIST_FIELD_FL_STACKTRACE = 1 << 8,
HIST_FIELD_FL_LOG2 = 1 << 9,
+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
+ HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
+ HIST_FIELD_FL_VAR = 1 << 12,
+ HIST_FIELD_FL_EXPR = 1 << 13,
+ HIST_FIELD_FL_VAR_REF = 1 << 14,
+ HIST_FIELD_FL_CPU = 1 << 15,
+ HIST_FIELD_FL_ALIAS = 1 << 16,
+};
+
+struct var_defs {
+ unsigned int n_vars;
+ char *name[TRACING_MAP_VARS_MAX];
+ char *expr[TRACING_MAP_VARS_MAX];
};
struct hist_trigger_attrs {
@@ -133,25 +242,1437 @@ struct hist_trigger_attrs {
char *vals_str;
char *sort_key_str;
char *name;
+ char *clock;
bool pause;
bool cont;
bool clear;
+ bool ts_in_usecs;
unsigned int map_bits;
+
+ char *assignment_str[TRACING_MAP_VARS_MAX];
+ unsigned int n_assignments;
+
+ char *action_str[HIST_ACTIONS_MAX];
+ unsigned int n_actions;
+
+ struct var_defs var_defs;
+};
+
+struct field_var {
+ struct hist_field *var;
+ struct hist_field *val;
+};
+
+struct field_var_hist {
+ struct hist_trigger_data *hist_data;
+ char *cmd;
};
struct hist_trigger_data {
- struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
+ struct hist_field *fields[HIST_FIELDS_MAX];
unsigned int n_vals;
unsigned int n_keys;
unsigned int n_fields;
+ unsigned int n_vars;
unsigned int key_size;
struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
unsigned int n_sort_keys;
struct trace_event_file *event_file;
struct hist_trigger_attrs *attrs;
struct tracing_map *map;
+ bool enable_timestamps;
+ bool remove;
+ struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
+ unsigned int n_var_refs;
+
+ struct action_data *actions[HIST_ACTIONS_MAX];
+ unsigned int n_actions;
+
+ struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
+ unsigned int n_synth_var_refs;
+ struct field_var *field_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_field_vars;
+ unsigned int n_field_var_str;
+ struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
+ unsigned int n_field_var_hists;
+
+ struct field_var *max_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_max_vars;
+ unsigned int n_max_var_str;
+};
+
+struct synth_field {
+ char *type;
+ char *name;
+ size_t size;
+ bool is_signed;
+ bool is_string;
+};
+
+struct synth_event {
+ struct list_head list;
+ int ref;
+ char *name;
+ struct synth_field **fields;
+ unsigned int n_fields;
+ unsigned int n_u64;
+ struct trace_event_class class;
+ struct trace_event_call call;
+ struct tracepoint *tp;
+};
+
+struct action_data;
+
+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals);
+
+struct action_data {
+ action_fn_t fn;
+ unsigned int n_params;
+ char *params[SYNTH_FIELDS_MAX];
+
+ union {
+ struct {
+ unsigned int var_ref_idx;
+ char *match_event;
+ char *match_event_system;
+ char *synth_event_name;
+ struct synth_event *synth_event;
+ } onmatch;
+
+ struct {
+ char *var_str;
+ char *fn_name;
+ unsigned int max_var_ref_idx;
+ struct hist_field *max_var;
+ struct hist_field *var;
+ } onmax;
+ };
+};
+
+
+static char last_hist_cmd[MAX_FILTER_STR_VAL];
+static char hist_err_str[MAX_FILTER_STR_VAL];
+
+static void last_cmd_set(char *str)
+{
+ if (!str)
+ return;
+
+ strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
+}
+
+static void hist_err(char *str, char *var)
+{
+ int maxlen = MAX_FILTER_STR_VAL - 1;
+
+ if (!str)
+ return;
+
+ if (strlen(hist_err_str))
+ return;
+
+ if (!var)
+ var = "";
+
+ if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
+ return;
+
+ strcat(hist_err_str, str);
+ strcat(hist_err_str, var);
+}
+
+static void hist_err_event(char *str, char *system, char *event, char *var)
+{
+ char err[MAX_FILTER_STR_VAL];
+
+ if (system && var)
+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
+ else if (system)
+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
+ else
+ strncpy(err, var, MAX_FILTER_STR_VAL);
+
+ hist_err(str, err);
+}
+
+static void hist_err_clear(void)
+{
+ hist_err_str[0] = '\0';
+}
+
+static bool have_hist_err(void)
+{
+ if (strlen(hist_err_str))
+ return true;
+
+ return false;
+}
+
+static LIST_HEAD(synth_event_list);
+static DEFINE_MUTEX(synth_event_mutex);
+
+struct synth_trace_event {
+ struct trace_entry ent;
+ u64 fields[];
+};
+
+static int synth_event_define_fields(struct trace_event_call *call)
+{
+ struct synth_trace_event trace;
+ int offset = offsetof(typeof(trace), fields);
+ struct synth_event *event = call->data;
+ unsigned int i, size, n_u64;
+ char *name, *type;
+ bool is_signed;
+ int ret = 0;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ size = event->fields[i]->size;
+ is_signed = event->fields[i]->is_signed;
+ type = event->fields[i]->type;
+ name = event->fields[i]->name;
+ ret = trace_define_field(call, type, name, offset, size,
+ is_signed, FILTER_OTHER);
+ if (ret)
+ break;
+
+ if (event->fields[i]->is_string) {
+ offset += STR_VAR_LEN_MAX;
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ offset += sizeof(u64);
+ n_u64++;
+ }
+ }
+
+ event->n_u64 = n_u64;
+
+ return ret;
+}
+
+static bool synth_field_signed(char *type)
+{
+ if (strncmp(type, "u", 1) == 0)
+ return false;
+
+ return true;
+}
+
+static int synth_field_is_string(char *type)
+{
+ if (strstr(type, "char[") != NULL)
+ return true;
+
+ return false;
+}
+
+static int synth_field_string_size(char *type)
+{
+ char buf[4], *end, *start;
+ unsigned int len;
+ int size, err;
+
+ start = strstr(type, "char[");
+ if (start == NULL)
+ return -EINVAL;
+ start += strlen("char[");
+
+ end = strchr(type, ']');
+ if (!end || end < start)
+ return -EINVAL;
+
+ len = end - start;
+ if (len > 3)
+ return -EINVAL;
+
+ strncpy(buf, start, len);
+ buf[len] = '\0';
+
+ err = kstrtouint(buf, 0, &size);
+ if (err)
+ return err;
+
+ if (size > STR_VAR_LEN_MAX)
+ return -EINVAL;
+
+ return size;
+}
+
+static int synth_field_size(char *type)
+{
+ int size = 0;
+
+ if (strcmp(type, "s64") == 0)
+ size = sizeof(s64);
+ else if (strcmp(type, "u64") == 0)
+ size = sizeof(u64);
+ else if (strcmp(type, "s32") == 0)
+ size = sizeof(s32);
+ else if (strcmp(type, "u32") == 0)
+ size = sizeof(u32);
+ else if (strcmp(type, "s16") == 0)
+ size = sizeof(s16);
+ else if (strcmp(type, "u16") == 0)
+ size = sizeof(u16);
+ else if (strcmp(type, "s8") == 0)
+ size = sizeof(s8);
+ else if (strcmp(type, "u8") == 0)
+ size = sizeof(u8);
+ else if (strcmp(type, "char") == 0)
+ size = sizeof(char);
+ else if (strcmp(type, "unsigned char") == 0)
+ size = sizeof(unsigned char);
+ else if (strcmp(type, "int") == 0)
+ size = sizeof(int);
+ else if (strcmp(type, "unsigned int") == 0)
+ size = sizeof(unsigned int);
+ else if (strcmp(type, "long") == 0)
+ size = sizeof(long);
+ else if (strcmp(type, "unsigned long") == 0)
+ size = sizeof(unsigned long);
+ else if (strcmp(type, "pid_t") == 0)
+ size = sizeof(pid_t);
+ else if (synth_field_is_string(type))
+ size = synth_field_string_size(type);
+
+ return size;
+}
+
+static const char *synth_field_fmt(char *type)
+{
+ const char *fmt = "%llu";
+
+ if (strcmp(type, "s64") == 0)
+ fmt = "%lld";
+ else if (strcmp(type, "u64") == 0)
+ fmt = "%llu";
+ else if (strcmp(type, "s32") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u32") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s16") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u16") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s8") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u8") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "char") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned char") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "int") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned int") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "long") == 0)
+ fmt = "%ld";
+ else if (strcmp(type, "unsigned long") == 0)
+ fmt = "%lu";
+ else if (strcmp(type, "pid_t") == 0)
+ fmt = "%d";
+ else if (synth_field_is_string(type))
+ fmt = "%s";
+
+ return fmt;
+}
+
+static enum print_line_t print_synth_event(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ struct trace_array *tr = iter->tr;
+ struct trace_seq *s = &iter->seq;
+ struct synth_trace_event *entry;
+ struct synth_event *se;
+ unsigned int i, n_u64;
+ char print_fmt[32];
+ const char *fmt;
+
+ entry = (struct synth_trace_event *)iter->ent;
+ se = container_of(event, struct synth_event, call.event);
+
+ trace_seq_printf(s, "%s: ", se->name);
+
+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ fmt = synth_field_fmt(se->fields[i]->type);
+
+ /* parameter types */
+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", fmt);
+
+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
+
+ /* parameter values */
+ if (se->fields[i]->is_string) {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ (char *)&entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64++;
+ }
+ }
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions synth_event_funcs = {
+ .trace = print_synth_event
+};
+
+static notrace void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int var_ref_idx)
+{
+ struct trace_event_file *trace_file = __data;
+ struct synth_trace_event *entry;
+ struct trace_event_buffer fbuffer;
+ struct ring_buffer *buffer;
+ struct synth_event *event;
+ unsigned int i, n_u64;
+ int fields_size = 0;
+
+ event = trace_file->event_call->data;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ fields_size = event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = trace_file->tr->trace_buffer.buffer;
+ ring_buffer_nest_start(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + fields_size);
+ if (!entry)
+ goto out;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ if (event->fields[i]->is_string) {
+ char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_field = (char *)&entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ n_u64++;
+ }
+ }
+
+ trace_event_buffer_commit(&fbuffer);
+out:
+ ring_buffer_nest_end(buffer);
+}
+
+static void free_synth_event_print_fmt(struct trace_event_call *call)
+{
+ if (call) {
+ kfree(call->print_fmt);
+ call->print_fmt = NULL;
+ }
+}
+
+static int __set_synth_event_print_fmt(struct synth_event *event,
+ char *buf, int len)
+{
+ const char *fmt;
+ int pos = 0;
+ int i;
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < event->n_fields; i++) {
+ fmt = synth_field_fmt(event->fields[i]->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
+ event->fields[i]->name, fmt,
+ i == event->n_fields - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < event->n_fields; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", event->fields[i]->name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int set_synth_event_print_fmt(struct trace_event_call *call)
+{
+ struct synth_event *event = call->data;
+ char *print_fmt;
+ int len;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_synth_event_print_fmt(event, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
+}
+
+static void free_synth_field(struct synth_field *field)
+{
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
+}
+
+static struct synth_field *parse_synth_field(char *field_type,
+ char *field_name)
+{
+ struct synth_field *field;
+ int len, ret = 0;
+ char *array;
+
+ if (field_type[0] == ';')
+ field_type++;
+
+ len = strlen(field_name);
+ if (field_name[len - 1] == ';')
+ field_name[len - 1] = '\0';
+
+ field = kzalloc(sizeof(*field), GFP_KERNEL);
+ if (!field)
+ return ERR_PTR(-ENOMEM);
+
+ len = strlen(field_type) + 1;
+ array = strchr(field_name, '[');
+ if (array)
+ len += strlen(array);
+ field->type = kzalloc(len, GFP_KERNEL);
+ if (!field->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ strcat(field->type, field_type);
+ if (array) {
+ strcat(field->type, array);
+ *array = '\0';
+ }
+
+ field->size = synth_field_size(field->type);
+ if (!field->size) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (synth_field_is_string(field->type))
+ field->is_string = true;
+
+ field->is_signed = synth_field_signed(field->type);
+
+ field->name = kstrdup(field_name, GFP_KERNEL);
+ if (!field->name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ out:
+ return field;
+ free:
+ free_synth_field(field);
+ field = ERR_PTR(ret);
+ goto out;
+}
+
+static void free_synth_tracepoint(struct tracepoint *tp)
+{
+ if (!tp)
+ return;
+
+ kfree(tp->name);
+ kfree(tp);
+}
+
+static struct tracepoint *alloc_synth_tracepoint(char *name)
+{
+ struct tracepoint *tp;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ tp->name = kstrdup(name, GFP_KERNEL);
+ if (!tp->name) {
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return tp;
+}
+
+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
+ unsigned int var_ref_idx);
+
+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
+ unsigned int var_ref_idx)
+{
+ struct tracepoint *tp = event->tp;
+
+ if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ synth_probe_func_t probe_func;
+ void *__data;
+
+ if (!(cpu_online(raw_smp_processor_id())))
+ return;
+
+ probe_func_ptr = rcu_dereference_sched((tp)->funcs);
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+ __data = probe_func_ptr->data;
+ probe_func(__data, var_ref_vals, var_ref_idx);
+ } while ((++probe_func_ptr)->func);
+ }
+ }
+}
+
+static struct synth_event *find_synth_event(const char *name)
+{
+ struct synth_event *event;
+
+ list_for_each_entry(event, &synth_event_list, list) {
+ if (strcmp(event->name, name) == 0)
+ return event;
+ }
+
+ return NULL;
+}
+
+static int register_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret = 0;
+
+ event->call.class = &event->class;
+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
+ if (!event->class.system) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event->tp = alloc_synth_tracepoint(event->name);
+ if (IS_ERR(event->tp)) {
+ ret = PTR_ERR(event->tp);
+ event->tp = NULL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &synth_event_funcs;
+ call->class->define_fields = synth_event_define_fields;
+
+ ret = register_trace_event(&call->event);
+ if (!ret) {
+ ret = -ENODEV;
+ goto out;
+ }
+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
+ call->class->reg = trace_event_reg;
+ call->class->probe = trace_event_raw_event_synth;
+ call->data = event;
+ call->tp = event->tp;
+
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_warn("Failed to register synthetic event: %s\n",
+ trace_event_name(call));
+ goto err;
+ }
+
+ ret = set_synth_event_print_fmt(call);
+ if (ret < 0) {
+ trace_remove_event_call(call);
+ goto err;
+ }
+ out:
+ return ret;
+ err:
+ unregister_trace_event(&call->event);
+ goto out;
+}
+
+static int unregister_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret;
+
+ ret = trace_remove_event_call(call);
+
+ return ret;
+}
+
+static void free_synth_event(struct synth_event *event)
+{
+ unsigned int i;
+
+ if (!event)
+ return;
+
+ for (i = 0; i < event->n_fields; i++)
+ free_synth_field(event->fields[i]);
+
+ kfree(event->fields);
+ kfree(event->name);
+ kfree(event->class.system);
+ free_synth_tracepoint(event->tp);
+ free_synth_event_print_fmt(&event->call);
+ kfree(event);
+}
+
+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
+ struct synth_field **fields)
+{
+ struct synth_event *event;
+ unsigned int i;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event) {
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->name = kstrdup(event_name, GFP_KERNEL);
+ if (!event->name) {
+ kfree(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+ if (!event->fields) {
+ free_synth_event(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ for (i = 0; i < n_fields; i++)
+ event->fields[i] = fields[i];
+
+ event->n_fields = n_fields;
+ out:
+ return event;
+}
+
+static void action_trace(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ struct synth_event *event = data->onmatch.synth_event;
+
+ trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
+}
+
+struct hist_var_data {
+ struct list_head list;
+ struct hist_trigger_data *hist_data;
+};
+
+static void add_or_delete_synth_event(struct synth_event *event, int delete)
+{
+ if (delete)
+ free_synth_event(event);
+ else {
+ mutex_lock(&synth_event_mutex);
+ if (!find_synth_event(event->name))
+ list_add(&event->list, &synth_event_list);
+ else
+ free_synth_event(event);
+ mutex_unlock(&synth_event_mutex);
+ }
+}
+
+static int create_synth_event(int argc, char **argv)
+{
+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ struct synth_event *event = NULL;
+ bool delete_event = false;
+ int i, n_fields = 0, ret = 0;
+ char *name;
+
+ mutex_lock(&synth_event_mutex);
+
+ /*
+ * Argument syntax:
+ * - Add synthetic event: <event_name> field[;field] ...
+ * - Remove synthetic event: !<event_name> field[;field] ...
+ * where 'field' = type field_name
+ */
+ if (argc < 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ name = argv[0];
+ if (name[0] == '!') {
+ delete_event = true;
+ name++;
+ }
+
+ event = find_synth_event(name);
+ if (event) {
+ if (delete_event) {
+ if (event->ref) {
+ event = NULL;
+ ret = -EBUSY;
+ goto out;
+ }
+ list_del(&event->list);
+ goto out;
+ }
+ event = NULL;
+ ret = -EEXIST;
+ goto out;
+ } else if (delete_event)
+ goto out;
+
+ if (argc < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 1; i < argc - 1; i++) {
+ if (strcmp(argv[i], ";") == 0)
+ continue;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ field = parse_synth_field(argv[i], argv[i + 1]);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto err;
+ }
+ fields[n_fields] = field;
+ i++; n_fields++;
+ }
+
+ if (i < argc) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ event = alloc_synth_event(name, n_fields, fields);
+ if (IS_ERR(event)) {
+ ret = PTR_ERR(event);
+ event = NULL;
+ goto err;
+ }
+ out:
+ mutex_unlock(&synth_event_mutex);
+
+ if (event) {
+ if (delete_event) {
+ ret = unregister_synth_event(event);
+ add_or_delete_synth_event(event, !ret);
+ } else {
+ ret = register_synth_event(event);
+ add_or_delete_synth_event(event, ret);
+ }
+ }
+
+ return ret;
+ err:
+ mutex_unlock(&synth_event_mutex);
+
+ for (i = 0; i < n_fields; i++)
+ free_synth_field(fields[i]);
+ free_synth_event(event);
+
+ return ret;
+}
+
+static int release_all_synth_events(void)
+{
+ struct list_head release_events;
+ struct synth_event *event, *e;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&release_events);
+
+ mutex_lock(&synth_event_mutex);
+
+ list_for_each_entry(event, &synth_event_list, list) {
+ if (event->ref) {
+ mutex_unlock(&synth_event_mutex);
+ return -EBUSY;
+ }
+ }
+
+ list_splice_init(&event->list, &release_events);
+
+ mutex_unlock(&synth_event_mutex);
+
+ list_for_each_entry_safe(event, e, &release_events, list) {
+ list_del(&event->list);
+
+ ret = unregister_synth_event(event);
+ add_or_delete_synth_event(event, !ret);
+ }
+
+ return ret;
+}
+
+
+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&synth_event_mutex);
+
+ return seq_list_start(&synth_event_list, *pos);
+}
+
+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &synth_event_list, pos);
+}
+
+static void synth_events_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&synth_event_mutex);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+ struct synth_field *field;
+ struct synth_event *event = v;
+ unsigned int i;
+
+ seq_printf(m, "%s\t", event->name);
+
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+
+ /* parameter values */
+ seq_printf(m, "%s %s%s", field->type, field->name,
+ i == event->n_fields - 1 ? "" : "; ");
+ }
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations synth_events_seq_op = {
+ .start = synth_events_seq_start,
+ .next = synth_events_seq_next,
+ .stop = synth_events_seq_stop,
+ .show = synth_events_seq_show
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = release_all_synth_events();
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+ .open = synth_events_open,
+ .write = synth_events_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static u64 hist_field_timestamp(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_trigger_data *hist_data = hist_field->hist_data;
+ struct trace_array *tr = hist_data->event_file->tr;
+
+ u64 ts = ring_buffer_event_time_stamp(rbe);
+
+ if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
+ ts = ns2usecs(ts);
+
+ return ts;
+}
+
+static u64 hist_field_cpu(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ int cpu = smp_processor_id();
+
+ return cpu;
+}
+
+static struct hist_field *
+check_field_for_var_ref(struct hist_field *hist_field,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx)
+{
+ struct hist_field *found = NULL;
+
+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (hist_field->var.idx == var_idx &&
+ hist_field->var.hist_data == var_data) {
+ found = hist_field;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *
+check_field_for_var_refs(struct hist_trigger_data *hist_data,
+ struct hist_field *hist_field,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx,
+ unsigned int level)
+{
+ struct hist_field *found = NULL;
+ unsigned int i;
+
+ if (level > 3)
+ return found;
+
+ if (!hist_field)
+ return found;
+
+ found = check_field_for_var_ref(hist_field, var_data, var_idx);
+ if (found)
+ return found;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+ struct hist_field *operand;
+
+ operand = hist_field->operands[i];
+ found = check_field_for_var_refs(hist_data, operand, var_data,
+ var_idx, level + 1);
+ if (found)
+ return found;
+ }
+
+ return found;
+}
+
+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx)
+{
+ struct hist_field *hist_field, *found = NULL;
+ unsigned int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ found = check_field_for_var_refs(hist_data, hist_field,
+ var_data, var_idx, 0);
+ if (found)
+ return found;
+ }
+
+ for (i = 0; i < hist_data->n_synth_var_refs; i++) {
+ hist_field = hist_data->synth_var_refs[i];
+ found = check_field_for_var_refs(hist_data, hist_field,
+ var_data, var_idx, 0);
+ if (found)
+ return found;
+ }
+
+ return found;
+}
+
+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
+ unsigned int var_idx)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *found = NULL;
+ struct hist_var_data *var_data;
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ if (var_data->hist_data == hist_data)
+ continue;
+ found = find_var_ref(var_data->hist_data, hist_data, var_idx);
+ if (found)
+ break;
+ }
+
+ return found;
+}
+
+static bool check_var_refs(struct hist_trigger_data *hist_data)
+{
+ struct hist_field *field;
+ bool found = false;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ field = hist_data->fields[i];
+ if (field && field->flags & HIST_FIELD_FL_VAR) {
+ if (find_any_var_ref(hist_data, field->var.idx)) {
+ found = true;
+ break;
+ }
+ }
+ }
+
+ return found;
+}
+
+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data, *found = NULL;
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ if (var_data->hist_data == hist_data) {
+ found = var_data;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static bool field_has_hist_vars(struct hist_field *hist_field,
+ unsigned int level)
+{
+ int i;
+
+ if (level > 3)
+ return false;
+
+ if (!hist_field)
+ return false;
+
+ if (hist_field->flags & HIST_FIELD_FL_VAR ||
+ hist_field->flags & HIST_FIELD_FL_VAR_REF)
+ return true;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+ struct hist_field *operand;
+
+ operand = hist_field->operands[i];
+ if (field_has_hist_vars(operand, level + 1))
+ return true;
+ }
+
+ return false;
+}
+
+static bool has_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct hist_field *hist_field;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (field_has_hist_vars(hist_field, 0))
+ return true;
+ }
+
+ return false;
+}
+
+static int save_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data;
+
+ var_data = find_hist_vars(hist_data);
+ if (var_data)
+ return 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
+ if (!var_data) {
+ trace_array_put(tr);
+ return -ENOMEM;
+ }
+
+ var_data->hist_data = hist_data;
+ list_add(&var_data->list, &tr->hist_vars);
+
+ return 0;
+}
+
+static void remove_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data;
+
+ var_data = find_hist_vars(hist_data);
+ if (!var_data)
+ return;
+
+ if (WARN_ON(check_var_refs(hist_data)))
+ return;
+
+ list_del(&var_data->list);
+
+ kfree(var_data);
+
+ trace_array_put(tr);
+}
+
+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
+ const char *var_name)
+{
+ struct hist_field *hist_field, *found = NULL;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
+ strcmp(hist_field->var.name, var_name) == 0) {
+ found = hist_field;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ const char *var_name)
+{
+ struct hist_trigger_data *test_data;
+ struct event_trigger_data *test;
+ struct hist_field *hist_field;
+
+ hist_field = find_var_field(hist_data, var_name);
+ if (hist_field)
+ return hist_field;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ test_data = test->private_data;
+ hist_field = find_var_field(test_data, var_name);
+ if (hist_field)
+ return hist_field;
+ }
+ }
+
+ return NULL;
+}
+
+static struct trace_event_file *find_var_file(struct trace_array *tr,
+ char *system,
+ char *event_name,
+ char *var_name)
+{
+ struct hist_trigger_data *var_hist_data;
+ struct hist_var_data *var_data;
+ struct trace_event_file *file, *found = NULL;
+
+ if (system)
+ return find_event_file(tr, system, event_name);
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ var_hist_data = var_data->hist_data;
+ file = var_hist_data->event_file;
+ if (file == found)
+ continue;
+
+ if (find_var_field(var_hist_data, var_name)) {
+ if (found) {
+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ return NULL;
+ }
+
+ found = file;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *find_file_var(struct trace_event_file *file,
+ const char *var_name)
+{
+ struct hist_trigger_data *test_data;
+ struct event_trigger_data *test;
+ struct hist_field *hist_field;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ test_data = test->private_data;
+ hist_field = find_var_field(test_data, var_name);
+ if (hist_field)
+ return hist_field;
+ }
+ }
+
+ return NULL;
+}
+
+static struct hist_field *
+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *hist_field, *found = NULL;
+ struct trace_event_file *file;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace) {
+ char *system = data->onmatch.match_event_system;
+ char *event_name = data->onmatch.match_event;
+
+ file = find_var_file(tr, system, event_name, var_name);
+ if (!file)
+ continue;
+ hist_field = find_file_var(file, var_name);
+ if (hist_field) {
+ if (found) {
+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ found = hist_field;
+ }
+ }
+ }
+ return found;
+}
+
+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
+ char *system,
+ char *event_name,
+ char *var_name)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *hist_field = NULL;
+ struct trace_event_file *file;
+
+ if (!system || !event_name) {
+ hist_field = find_match_var(hist_data, var_name);
+ if (IS_ERR(hist_field))
+ return NULL;
+ if (hist_field)
+ return hist_field;
+ }
+
+ file = find_var_file(tr, system, event_name, var_name);
+ if (!file)
+ return NULL;
+
+ hist_field = find_file_var(file, var_name);
+
+ return hist_field;
+}
+
+struct hist_elt_data {
+ char *comm;
+ u64 *var_ref_vals;
+ char *field_var_str[SYNTH_FIELDS_MAX];
};
+static u64 hist_field_var_ref(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+ u64 var_val = 0;
+
+ elt_data = elt->private_data;
+ var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
+
+ return var_val;
+}
+
+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
+ u64 *var_ref_vals, bool self)
+{
+ struct hist_trigger_data *var_data;
+ struct tracing_map_elt *var_elt;
+ struct hist_field *hist_field;
+ unsigned int i, var_idx;
+ bool resolved = true;
+ u64 var_val = 0;
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ hist_field = hist_data->var_refs[i];
+ var_idx = hist_field->var.idx;
+ var_data = hist_field->var.hist_data;
+
+ if (var_data == NULL) {
+ resolved = false;
+ break;
+ }
+
+ if ((self && var_data != hist_data) ||
+ (!self && var_data == hist_data))
+ continue;
+
+ var_elt = tracing_map_lookup(var_data->map, key);
+ if (!var_elt) {
+ resolved = false;
+ break;
+ }
+
+ if (!tracing_map_var_set(var_elt, var_idx)) {
+ resolved = false;
+ break;
+ }
+
+ if (self || !hist_field->read_once)
+ var_val = tracing_map_read_var(var_elt, var_idx);
+ else
+ var_val = tracing_map_read_var_once(var_elt, var_idx);
+
+ var_ref_vals[i] = var_val;
+ }
+
+ return resolved;
+}
+
static const char *hist_field_name(struct hist_field *field,
unsigned int level)
{
@@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
- else if (field->flags & HIST_FIELD_FL_LOG2)
+ else if (field->flags & HIST_FIELD_FL_LOG2 ||
+ field->flags & HIST_FIELD_FL_ALIAS)
field_name = hist_field_name(field->operands[0], ++level);
+ else if (field->flags & HIST_FIELD_FL_CPU)
+ field_name = "cpu";
+ else if (field->flags & HIST_FIELD_FL_EXPR ||
+ field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (field->system) {
+ static char full_name[MAX_FILTER_STR_VAL];
+
+ strcat(full_name, field->system);
+ strcat(full_name, ".");
+ strcat(full_name, field->event_name);
+ strcat(full_name, ".");
+ strcat(full_name, field->name);
+ field_name = full_name;
+ } else
+ field_name = field->name;
+ } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
+ field_name = "common_timestamp";
if (field_name == NULL)
field_name = "";
@@ -232,16 +1771,119 @@ static int parse_map_size(char *str)
static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
{
+ unsigned int i;
+
if (!attrs)
return;
+ for (i = 0; i < attrs->n_assignments; i++)
+ kfree(attrs->assignment_str[i]);
+
+ for (i = 0; i < attrs->n_actions; i++)
+ kfree(attrs->action_str[i]);
+
kfree(attrs->name);
kfree(attrs->sort_key_str);
kfree(attrs->keys_str);
kfree(attrs->vals_str);
+ kfree(attrs->clock);
kfree(attrs);
}
+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
+{
+ int ret = -EINVAL;
+
+ if (attrs->n_actions >= HIST_ACTIONS_MAX)
+ return ret;
+
+ if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
+ (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+ attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
+ if (!attrs->action_str[attrs->n_actions]) {
+ ret = -ENOMEM;
+ return ret;
+ }
+ attrs->n_actions++;
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
+{
+ int ret = 0;
+
+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->keys_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+ (strncmp(str, "values=", strlen("values=")) == 0)) {
+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->vals_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->sort_key_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
+ attrs->name = kstrdup(str, GFP_KERNEL);
+ if (!attrs->name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+ strsep(&str, "=");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ str = strstrip(str);
+ attrs->clock = kstrdup(str, GFP_KERNEL);
+ if (!attrs->clock) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ int map_bits = parse_map_size(str);
+
+ if (map_bits < 0) {
+ ret = map_bits;
+ goto out;
+ }
+ attrs->map_bits = map_bits;
+ } else {
+ char *assignment;
+
+ if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
+ hist_err("Too many variables defined: ", str);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ assignment = kstrdup(str, GFP_KERNEL);
+ if (!assignment) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attrs->assignment_str[attrs->n_assignments++] = assignment;
+ }
+ out:
+ return ret;
+}
+
static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
{
struct hist_trigger_attrs *attrs;
@@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
- (strncmp(str, "keys=", strlen("keys=")) == 0))
- attrs->keys_str = kstrdup(str, GFP_KERNEL);
- else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
- (strncmp(str, "values=", strlen("values=")) == 0))
- attrs->vals_str = kstrdup(str, GFP_KERNEL);
- else if (strncmp(str, "sort=", strlen("sort=")) == 0)
- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
- else if (strncmp(str, "name=", strlen("name=")) == 0)
- attrs->name = kstrdup(str, GFP_KERNEL);
- else if (strcmp(str, "pause") == 0)
+ if (strchr(str, '=')) {
+ ret = parse_assignment(str, attrs);
+ if (ret)
+ goto free;
+ } else if (strcmp(str, "pause") == 0)
attrs->pause = true;
else if ((strcmp(str, "cont") == 0) ||
(strcmp(str, "continue") == 0))
attrs->cont = true;
else if (strcmp(str, "clear") == 0)
attrs->clear = true;
- else if (strncmp(str, "size=", strlen("size=")) == 0) {
- int map_bits = parse_map_size(str);
-
- if (map_bits < 0) {
- ret = map_bits;
+ else {
+ ret = parse_action(str, attrs);
+ if (ret)
goto free;
- }
- attrs->map_bits = map_bits;
- } else {
- ret = -EINVAL;
- goto free;
}
}
@@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
goto free;
}
+ if (!attrs->clock) {
+ attrs->clock = kstrdup("global", GFP_KERNEL);
+ if (!attrs->clock) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
return attrs;
free:
destroy_hist_trigger_attrs(attrs);
@@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task)
memcpy(comm, task->comm, TASK_COMM_LEN);
}
-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
- kfree((char *)elt->private_data);
+ unsigned int i;
+
+ for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ kfree(elt_data->field_var_str[i]);
+
+ kfree(elt_data->comm);
+ kfree(elt_data);
}
-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+
+ hist_elt_data_free(elt_data);
+}
+
+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
{
struct hist_trigger_data *hist_data = elt->map->private_data;
+ unsigned int size = TASK_COMM_LEN;
+ struct hist_elt_data *elt_data;
struct hist_field *key_field;
- unsigned int i;
+ unsigned int i, n_str;
+
+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ if (!elt_data)
+ return -ENOMEM;
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
- unsigned int size = TASK_COMM_LEN + 1;
-
- elt->private_data = kzalloc(size, GFP_KERNEL);
- if (!elt->private_data)
+ elt_data->comm = kzalloc(size, GFP_KERNEL);
+ if (!elt_data->comm) {
+ kfree(elt_data);
return -ENOMEM;
+ }
break;
}
}
+ n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+
+ size = STR_VAR_LEN_MAX;
+
+ for (i = 0; i < n_str; i++) {
+ elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
+ if (!elt_data->field_var_str[i]) {
+ hist_elt_data_free(elt_data);
+ return -ENOMEM;
+ }
+ }
+
+ elt->private_data = elt_data;
+
return 0;
}
-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
- struct tracing_map_elt *from)
+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+
+ if (elt_data->comm)
+ save_comm(elt_data->comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
+ .elt_alloc = hist_trigger_elt_data_alloc,
+ .elt_free = hist_trigger_elt_data_free,
+ .elt_init = hist_trigger_elt_data_init,
+};
+
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+ const char *flags_str = NULL;
+
+ if (hist_field->flags & HIST_FIELD_FL_HEX)
+ flags_str = "hex";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
+ flags_str = "sym";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+ flags_str = "sym-offset";
+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+ flags_str = "execname";
+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+ flags_str = "syscall";
+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+ flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+ flags_str = "usecs";
+
+ return flags_str;
+}
+
+static void expr_field_str(struct hist_field *field, char *expr)
{
- char *comm_from = from->private_data;
- char *comm_to = to->private_data;
+ if (field->flags & HIST_FIELD_FL_VAR_REF)
+ strcat(expr, "$");
+
+ strcat(expr, hist_field_name(field, 0));
- if (comm_from)
- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+ if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
+ const char *flags_str = get_hist_field_flags(field);
+
+ if (flags_str) {
+ strcat(expr, ".");
+ strcat(expr, flags_str);
+ }
+ }
}
-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+static char *expr_str(struct hist_field *field, unsigned int level)
{
- char *comm = elt->private_data;
+ char *expr;
+
+ if (level > 1)
+ return NULL;
+
+ expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!expr)
+ return NULL;
+
+ if (!field->operands[0]) {
+ expr_field_str(field, expr);
+ return expr;
+ }
+
+ if (field->operator == FIELD_OP_UNARY_MINUS) {
+ char *subexpr;
- if (comm)
- save_comm(comm, current);
+ strcat(expr, "-(");
+ subexpr = expr_str(field->operands[0], ++level);
+ if (!subexpr) {
+ kfree(expr);
+ return NULL;
+ }
+ strcat(expr, subexpr);
+ strcat(expr, ")");
+
+ kfree(subexpr);
+
+ return expr;
+ }
+
+ expr_field_str(field->operands[0], expr);
+
+ switch (field->operator) {
+ case FIELD_OP_MINUS:
+ strcat(expr, "-");
+ break;
+ case FIELD_OP_PLUS:
+ strcat(expr, "+");
+ break;
+ default:
+ kfree(expr);
+ return NULL;
+ }
+
+ expr_field_str(field->operands[1], expr);
+
+ return expr;
}
-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
- .elt_alloc = hist_trigger_elt_comm_alloc,
- .elt_copy = hist_trigger_elt_comm_copy,
- .elt_free = hist_trigger_elt_comm_free,
- .elt_init = hist_trigger_elt_comm_init,
-};
+static int contains_operator(char *str)
+{
+ enum field_op_id field_op = FIELD_OP_NONE;
+ char *op;
+
+ op = strpbrk(str, "+-");
+ if (!op)
+ return FIELD_OP_NONE;
+
+ switch (*op) {
+ case '-':
+ if (*str == '-')
+ field_op = FIELD_OP_UNARY_MINUS;
+ else
+ field_op = FIELD_OP_MINUS;
+ break;
+ case '+':
+ field_op = FIELD_OP_PLUS;
+ break;
+ default:
+ break;
+ }
+
+ return field_op;
+}
static void destroy_hist_field(struct hist_field *hist_field,
unsigned int level)
{
unsigned int i;
- if (level > 2)
+ if (level > 3)
return;
if (!hist_field)
@@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field,
for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
destroy_hist_field(hist_field->operands[i], level + 1);
+ kfree(hist_field->var.name);
+ kfree(hist_field->name);
+ kfree(hist_field->type);
+
kfree(hist_field);
}
-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
- unsigned long flags)
+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
+ struct ftrace_event_field *field,
+ unsigned long flags,
+ char *var_name)
{
struct hist_field *hist_field;
@@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (!hist_field)
return NULL;
+ hist_field->hist_data = hist_data;
+
+ if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
+ goto out; /* caller will populate */
+
+ if (flags & HIST_FIELD_FL_VAR_REF) {
+ hist_field->fn = hist_field_var_ref;
+ goto out;
+ }
+
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
+ hist_field->size = sizeof(u64);
+ hist_field->type = kstrdup("u64", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (flags & HIST_FIELD_FL_LOG2) {
unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
- hist_field->operands[0] = create_hist_field(field, fl);
+ hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
+ hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_TIMESTAMP) {
+ hist_field->fn = hist_field_timestamp;
+ hist_field->size = sizeof(u64);
+ hist_field->type = kstrdup("u64", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_CPU) {
+ hist_field->fn = hist_field_cpu;
+ hist_field->size = sizeof(int);
+ hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (is_string_field(field)) {
flags |= HIST_FIELD_FL_STRING;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
if (field->filter_type == FILTER_STATIC_STRING)
hist_field->fn = hist_field_string;
else if (field->filter_type == FILTER_DYN_STRING)
@@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
else
hist_field->fn = hist_field_pstring;
} else {
+ hist_field->size = field->size;
+ hist_field->is_signed = field->is_signed;
+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
@@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
hist_field->field = field;
hist_field->flags = flags;
+ if (var_name) {
+ hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
+ if (!hist_field->var.name)
+ goto free;
+ }
+
return hist_field;
+ free:
+ destroy_hist_field(hist_field, 0);
+ return NULL;
}
static void destroy_hist_fields(struct hist_trigger_data *hist_data)
{
unsigned int i;
- for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+ for (i = 0; i < HIST_FIELDS_MAX; i++) {
if (hist_data->fields[i]) {
destroy_hist_field(hist_data->fields[i], 0);
hist_data->fields[i] = NULL;
@@ -451,69 +2287,1610 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
}
}
-static int create_hitcount_val(struct hist_trigger_data *hist_data)
+static int init_var_ref(struct hist_field *ref_field,
+ struct hist_field *var_field,
+ char *system, char *event_name)
{
- hist_data->fields[HITCOUNT_IDX] =
- create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
- if (!hist_data->fields[HITCOUNT_IDX])
- return -ENOMEM;
+ int err = 0;
+
+ ref_field->var.idx = var_field->var.idx;
+ ref_field->var.hist_data = var_field->hist_data;
+ ref_field->size = var_field->size;
+ ref_field->is_signed = var_field->is_signed;
+ ref_field->flags |= var_field->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+ if (system) {
+ ref_field->system = kstrdup(system, GFP_KERNEL);
+ if (!ref_field->system)
+ return -ENOMEM;
+ }
- hist_data->n_vals++;
+ if (event_name) {
+ ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ref_field->event_name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ }
- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ if (var_field->var.name) {
+ ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
+ if (!ref_field->name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ } else if (var_field->name) {
+ ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
+ if (!ref_field->name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ }
+
+ ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ if (!ref_field->type) {
+ err = -ENOMEM;
+ goto free;
+ }
+ out:
+ return err;
+ free:
+ kfree(ref_field->system);
+ kfree(ref_field->event_name);
+ kfree(ref_field->name);
+
+ goto out;
+}
+
+static struct hist_field *create_var_ref(struct hist_field *var_field,
+ char *system, char *event_name)
+{
+ unsigned long flags = HIST_FIELD_FL_VAR_REF;
+ struct hist_field *ref_field;
+
+ ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
+ if (ref_field) {
+ if (init_var_ref(ref_field, var_field, system, event_name)) {
+ destroy_hist_field(ref_field, 0);
+ return NULL;
+ }
+ }
+
+ return ref_field;
+}
+
+static bool is_var_ref(char *var_name)
+{
+ if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
+ return false;
+
+ return true;
+}
+
+static char *field_name_from_var(struct hist_trigger_data *hist_data,
+ char *var_name)
+{
+ char *name, *field;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+ name = hist_data->attrs->var_defs.name[i];
+
+ if (strcmp(var_name, name) == 0) {
+ field = hist_data->attrs->var_defs.expr[i];
+ if (contains_operator(field) || is_var_ref(field))
+ continue;
+ return field;
+ }
+ }
+
+ return NULL;
+}
+
+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
+ char *system, char *event_name,
+ char *var_name)
+{
+ struct trace_event_call *call;
+
+ if (system && event_name) {
+ call = hist_data->event_file->event_call;
+
+ if (strcmp(system, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ if (!!system != !!event_name)
+ return NULL;
+
+ if (!is_var_ref(var_name))
+ return NULL;
+
+ var_name++;
+
+ return field_name_from_var(hist_data, var_name);
+}
+
+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
+ char *system, char *event_name,
+ char *var_name)
+{
+ struct hist_field *var_field = NULL, *ref_field = NULL;
+
+ if (!is_var_ref(var_name))
+ return NULL;
+
+ var_name++;
+
+ var_field = find_event_var(hist_data, system, event_name, var_name);
+ if (var_field)
+ ref_field = create_var_ref(var_field, system, event_name);
+
+ if (!ref_field)
+ hist_err_event("Couldn't find variable: $",
+ system, event_name, var_name);
+
+ return ref_field;
+}
+
+static struct ftrace_event_field *
+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
+ char *field_str, unsigned long *flags)
+{
+ struct ftrace_event_field *field = NULL;
+ char *field_name, *modifier, *str;
+
+ modifier = str = kstrdup(field_str, GFP_KERNEL);
+ if (!modifier)
+ return ERR_PTR(-ENOMEM);
+
+ field_name = strsep(&modifier, ".");
+ if (modifier) {
+ if (strcmp(modifier, "hex") == 0)
+ *flags |= HIST_FIELD_FL_HEX;
+ else if (strcmp(modifier, "sym") == 0)
+ *flags |= HIST_FIELD_FL_SYM;
+ else if (strcmp(modifier, "sym-offset") == 0)
+ *flags |= HIST_FIELD_FL_SYM_OFFSET;
+ else if ((strcmp(modifier, "execname") == 0) &&
+ (strcmp(field_name, "common_pid") == 0))
+ *flags |= HIST_FIELD_FL_EXECNAME;
+ else if (strcmp(modifier, "syscall") == 0)
+ *flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(modifier, "log2") == 0)
+ *flags |= HIST_FIELD_FL_LOG2;
+ else if (strcmp(modifier, "usecs") == 0)
+ *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
+ else {
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ }
+
+ if (strcmp(field_name, "common_timestamp") == 0) {
+ *flags |= HIST_FIELD_FL_TIMESTAMP;
+ hist_data->enable_timestamps = true;
+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+ hist_data->attrs->ts_in_usecs = true;
+ } else if (strcmp(field_name, "cpu") == 0)
+ *flags |= HIST_FIELD_FL_CPU;
+ else {
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field || !field->size) {
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ }
+ out:
+ kfree(str);
+
+ return field;
+}
+
+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
+ struct hist_field *var_ref,
+ char *var_name)
+{
+ struct hist_field *alias = NULL;
+ unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
+
+ alias = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!alias)
+ return NULL;
+
+ alias->fn = var_ref->fn;
+ alias->operands[0] = var_ref;
+
+ if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
+ destroy_hist_field(alias, 0);
+ return NULL;
+ }
+
+ return alias;
+}
+
+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file, char *str,
+ unsigned long *flags, char *var_name)
+{
+ char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
+ struct ftrace_event_field *field = NULL;
+ struct hist_field *hist_field = NULL;
+ int ret = 0;
+
+ s = strchr(str, '.');
+ if (s) {
+ s = strchr(++s, '.');
+ if (s) {
+ ref_system = strsep(&str, ".");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ref_event = strsep(&str, ".");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ref_var = str;
+ }
+ }
+
+ s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
+ if (!s) {
+ hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
+ if (hist_field) {
+ hist_data->var_refs[hist_data->n_var_refs] = hist_field;
+ hist_field->var_ref_idx = hist_data->n_var_refs++;
+ if (var_name) {
+ hist_field = create_alias(hist_data, hist_field, var_name);
+ if (!hist_field) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ return hist_field;
+ }
+ } else
+ str = s;
+
+ field = parse_field(hist_data, file, str, flags);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto out;
+ }
+
+ hist_field = create_hist_field(hist_data, field, *flags, var_name);
+ if (!hist_field) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ return hist_field;
+ out:
+ return ERR_PTR(ret);
+}
+
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level);
+
+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level)
+{
+ struct hist_field *operand1, *expr = NULL;
+ unsigned long operand_flags;
+ int ret = 0;
+ char *s;
+
+ /* we support only -(xxx) i.e. explicit parens required */
+
+ if (level > 3) {
+ hist_err("Too many subexpressions (3 max): ", str);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ str++; /* skip leading '-' */
+
+ s = strchr(str, '(');
+ if (s)
+ str++;
+ else {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ s = strrchr(str, ')');
+ if (s)
+ *s = '\0';
+ else {
+ ret = -EINVAL; /* no closing ')' */
+ goto free;
+ }
+
+ flags |= HIST_FIELD_FL_EXPR;
+ expr = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!expr) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ operand_flags = 0;
+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ if (IS_ERR(operand1)) {
+ ret = PTR_ERR(operand1);
+ goto free;
+ }
+
+ expr->flags |= operand1->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+ expr->fn = hist_field_unary_minus;
+ expr->operands[0] = operand1;
+ expr->operator = FIELD_OP_UNARY_MINUS;
+ expr->name = expr_str(expr, 0);
+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ return expr;
+ free:
+ destroy_hist_field(expr, 0);
+ return ERR_PTR(ret);
+}
+
+static int check_expr_operands(struct hist_field *operand1,
+ struct hist_field *operand2)
+{
+ unsigned long operand1_flags = operand1->flags;
+ unsigned long operand2_flags = operand2->flags;
+
+ if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
+ (operand1_flags & HIST_FIELD_FL_ALIAS)) {
+ struct hist_field *var;
+
+ var = find_var_field(operand1->var.hist_data, operand1->name);
+ if (!var)
+ return -EINVAL;
+ operand1_flags = var->flags;
+ }
+
+ if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
+ (operand2_flags & HIST_FIELD_FL_ALIAS)) {
+ struct hist_field *var;
+
+ var = find_var_field(operand2->var.hist_data, operand2->name);
+ if (!var)
+ return -EINVAL;
+ operand2_flags = var->flags;
+ }
+
+ if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
+ (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
+ hist_err("Timestamp units in expression don't match", NULL);
return -EINVAL;
+ }
return 0;
}
-static int create_val_field(struct hist_trigger_data *hist_data,
- unsigned int val_idx,
- struct trace_event_file *file,
- char *field_str)
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level)
{
- struct ftrace_event_field *field = NULL;
- unsigned long flags = 0;
- char *field_name;
+ struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
+ unsigned long operand_flags;
+ int field_op, ret = -EINVAL;
+ char *sep, *operand1_str;
+
+ if (level > 3) {
+ hist_err("Too many subexpressions (3 max): ", str);
+ return ERR_PTR(-EINVAL);
+ }
+
+ field_op = contains_operator(str);
+
+ if (field_op == FIELD_OP_NONE)
+ return parse_atom(hist_data, file, str, &flags, var_name);
+
+ if (field_op == FIELD_OP_UNARY_MINUS)
+ return parse_unary(hist_data, file, str, flags, var_name, ++level);
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ sep = "-";
+ break;
+ case FIELD_OP_PLUS:
+ sep = "+";
+ break;
+ default:
+ goto free;
+ }
+
+ operand1_str = strsep(&str, sep);
+ if (!operand1_str || !str)
+ goto free;
+
+ operand_flags = 0;
+ operand1 = parse_atom(hist_data, file, operand1_str,
+ &operand_flags, NULL);
+ if (IS_ERR(operand1)) {
+ ret = PTR_ERR(operand1);
+ operand1 = NULL;
+ goto free;
+ }
+
+ /* rest of string could be another expression e.g. b+c in a+b+c */
+ operand_flags = 0;
+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ if (IS_ERR(operand2)) {
+ ret = PTR_ERR(operand2);
+ operand2 = NULL;
+ goto free;
+ }
+
+ ret = check_expr_operands(operand1, operand2);
+ if (ret)
+ goto free;
+
+ flags |= HIST_FIELD_FL_EXPR;
+
+ flags |= operand1->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+ expr = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!expr) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ operand1->read_once = true;
+ operand2->read_once = true;
+
+ expr->operands[0] = operand1;
+ expr->operands[1] = operand2;
+ expr->operator = field_op;
+ expr->name = expr_str(expr, 0);
+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ expr->fn = hist_field_minus;
+ break;
+ case FIELD_OP_PLUS:
+ expr->fn = hist_field_plus;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free;
+ }
+
+ return expr;
+ free:
+ destroy_hist_field(operand1, 0);
+ destroy_hist_field(operand2, 0);
+ destroy_hist_field(expr, 0);
+
+ return ERR_PTR(ret);
+}
+
+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ struct event_trigger_data *test;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (test->private_data == hist_data)
+ return test->filter_str;
+ }
+ }
+
+ return NULL;
+}
+
+static struct event_command trigger_hist_cmd;
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param);
+
+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
+ struct hist_trigger_data *hist_data,
+ unsigned int n_keys)
+{
+ struct hist_field *target_hist_field, *hist_field;
+ unsigned int n, i, j;
+
+ if (hist_data->n_fields - hist_data->n_vals != n_keys)
+ return false;
+
+ i = hist_data->n_vals;
+ j = target_hist_data->n_vals;
+
+ for (n = 0; n < n_keys; n++) {
+ hist_field = hist_data->fields[i + n];
+ target_hist_field = target_hist_data->fields[j + n];
+
+ if (strcmp(hist_field->type, target_hist_field->type) != 0)
+ return false;
+ if (hist_field->size != target_hist_field->size)
+ return false;
+ if (hist_field->is_signed != target_hist_field->is_signed)
+ return false;
+ }
+
+ return true;
+}
+
+static struct hist_trigger_data *
+find_compatible_hist(struct hist_trigger_data *target_hist_data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *test;
+ unsigned int n_keys;
+
+ n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
+
+ if (compatible_keys(target_hist_data, hist_data, n_keys))
+ return hist_data;
+ }
+ }
+
+ return NULL;
+}
+
+static struct trace_event_file *event_file(struct trace_array *tr,
+ char *system, char *event_name)
+{
+ struct trace_event_file *file;
+
+ file = find_event_file(tr, system, event_name);
+ if (!file)
+ return ERR_PTR(-EINVAL);
+
+ return file;
+}
+
+static struct hist_field *
+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
+ char *system, char *event_name, char *field_name)
+{
+ struct hist_field *event_var;
+ char *synthetic_name;
+
+ synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!synthetic_name)
+ return ERR_PTR(-ENOMEM);
+
+ strcpy(synthetic_name, "synthetic_");
+ strcat(synthetic_name, field_name);
+
+ event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
+
+ kfree(synthetic_name);
+
+ return event_var;
+}
+
+/**
+ * create_field_var_hist - Automatically create a histogram and var for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @field_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events. However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * If a user specifies a field on an event that isn't the event the
+ * histogram currently being defined (the target event histogram), the
+ * only way that can be accomplished is if a new hist trigger is
+ * created and the field variable defined on that.
+ *
+ * This function creates a new histogram compatible with the target
+ * event (meaning a histogram with the same key as the target
+ * histogram), and creates a variable for the specified field, but
+ * with 'synthetic_' prepended to the variable name in order to avoid
+ * collision with normal field variables.
+ *
+ * Return: The variable created for the field.
+ */
+static struct hist_field *
+create_field_var_hist(struct hist_trigger_data *target_hist_data,
+ char *subsys_name, char *event_name, char *field_name)
+{
+ struct trace_array *tr = target_hist_data->event_file->tr;
+ struct hist_field *event_var = ERR_PTR(-EINVAL);
+ struct hist_trigger_data *hist_data;
+ unsigned int i, n, first = true;
+ struct field_var_hist *var_hist;
+ struct trace_event_file *file;
+ struct hist_field *key_field;
+ char *saved_filter;
+ char *cmd;
+ int ret;
+
+ if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
+ hist_err_event("onmatch: Too many field variables defined: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ file = event_file(tr, subsys_name, event_name);
+
+ if (IS_ERR(file)) {
+ hist_err_event("onmatch: Event file not found: ",
+ subsys_name, event_name, field_name);
+ ret = PTR_ERR(file);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Look for a histogram compatible with target. We'll use the
+ * found histogram specification to create a new matching
+ * histogram with our variable on it. target_hist_data is not
+ * yet a registered histogram so we can't use that.
+ */
+ hist_data = find_compatible_hist(target_hist_data, file);
+ if (!hist_data) {
+ hist_err_event("onmatch: Matching event histogram not found: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* See if a synthetic field variable has already been created */
+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+ event_name, field_name);
+ if (!IS_ERR_OR_NULL(event_var))
+ return event_var;
+
+ var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
+ if (!var_hist)
+ return ERR_PTR(-ENOMEM);
+
+ cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!cmd) {
+ kfree(var_hist);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Use the same keys as the compatible histogram */
+ strcat(cmd, "keys=");
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+ if (!first)
+ strcat(cmd, ",");
+ strcat(cmd, key_field->field->name);
+ first = false;
+ }
+
+ /* Create the synthetic field variable specification */
+ strcat(cmd, ":synthetic_");
+ strcat(cmd, field_name);
+ strcat(cmd, "=");
+ strcat(cmd, field_name);
+
+ /* Use the same filter as the compatible histogram */
+ saved_filter = find_trigger_filter(hist_data, file);
+ if (saved_filter) {
+ strcat(cmd, " if ");
+ strcat(cmd, saved_filter);
+ }
+
+ var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
+ if (!var_hist->cmd) {
+ kfree(cmd);
+ kfree(var_hist);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Save the compatible histogram information */
+ var_hist->hist_data = hist_data;
+
+ /* Create the new histogram with our variable */
+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+ "", "hist", cmd);
+ if (ret) {
+ kfree(cmd);
+ kfree(var_hist->cmd);
+ kfree(var_hist);
+ hist_err_event("onmatch: Couldn't create histogram for field: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(ret);
+ }
+
+ kfree(cmd);
+
+ /* If we can't find the variable, something went wrong */
+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+ event_name, field_name);
+ if (IS_ERR_OR_NULL(event_var)) {
+ kfree(var_hist->cmd);
+ kfree(var_hist);
+ hist_err_event("onmatch: Couldn't find synthetic variable: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ n = target_hist_data->n_field_var_hists;
+ target_hist_data->field_var_hists[n] = var_hist;
+ target_hist_data->n_field_var_hists++;
+
+ return event_var;
+}
+
+static struct hist_field *
+find_target_event_var(struct hist_trigger_data *hist_data,
+ char *subsys_name, char *event_name, char *var_name)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct hist_field *hist_field = NULL;
+
+ if (subsys_name) {
+ struct trace_event_call *call;
+
+ if (!event_name)
+ return NULL;
+
+ call = file->event_call;
+
+ if (strcmp(subsys_name, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ hist_field = find_var_field(hist_data, var_name);
+
+ return hist_field;
+}
+
+static inline void __update_field_vars(struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec,
+ struct field_var **field_vars,
+ unsigned int n_field_vars,
+ unsigned int field_var_str_start)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+ unsigned int i, j, var_idx;
+ u64 var_val;
+
+ for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
+ struct field_var *field_var = field_vars[i];
+ struct hist_field *var = field_var->var;
+ struct hist_field *val = field_var->val;
+
+ var_val = val->fn(val, elt, rbe, rec);
+ var_idx = var->var.idx;
+
+ if (val->flags & HIST_FIELD_FL_STRING) {
+ char *str = elt_data->field_var_str[j++];
+ char *val_str = (char *)(uintptr_t)var_val;
+
+ strscpy(str, val_str, STR_VAR_LEN_MAX);
+ var_val = (u64)(uintptr_t)str;
+ }
+ tracing_map_set_var(elt, var_idx, var_val);
+ }
+}
+
+static void update_field_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec)
+{
+ __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+ hist_data->n_field_vars, 0);
+}
+
+static void update_max_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec)
+{
+ __update_field_vars(elt, rbe, rec, hist_data->max_vars,
+ hist_data->n_max_vars, hist_data->n_field_var_str);
+}
+
+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *name, int size, const char *type)
+{
+ struct hist_field *var;
+ int idx;
+
+ if (find_var(hist_data, file, name) && !hist_data->remove) {
+ var = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+ if (!var) {
+ var = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ idx = tracing_map_add_var(hist_data->map);
+ if (idx < 0) {
+ kfree(var);
+ var = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ var->flags = HIST_FIELD_FL_VAR;
+ var->var.idx = idx;
+ var->var.hist_data = var->hist_data = hist_data;
+ var->size = size;
+ var->var.name = kstrdup(name, GFP_KERNEL);
+ var->type = kstrdup(type, GFP_KERNEL);
+ if (!var->var.name || !var->type) {
+ kfree(var->var.name);
+ kfree(var->type);
+ kfree(var);
+ var = ERR_PTR(-ENOMEM);
+ }
+ out:
+ return var;
+}
+
+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *field_name)
+{
+ struct hist_field *val = NULL, *var = NULL;
+ unsigned long flags = HIST_FIELD_FL_VAR;
+ struct field_var *field_var;
int ret = 0;
- if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+ if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
+ hist_err("Too many field variables defined: ", field_name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ val = parse_atom(hist_data, file, field_name, &flags, NULL);
+ if (IS_ERR(val)) {
+ hist_err("Couldn't parse field variable: ", field_name);
+ ret = PTR_ERR(val);
+ goto err;
+ }
+
+ var = create_var(hist_data, file, field_name, val->size, val->type);
+ if (IS_ERR(var)) {
+ hist_err("Couldn't create or find variable: ", field_name);
+ kfree(val);
+ ret = PTR_ERR(var);
+ goto err;
+ }
+
+ field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
+ if (!field_var) {
+ kfree(val);
+ kfree(var);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ field_var->var = var;
+ field_var->val = val;
+ out:
+ return field_var;
+ err:
+ field_var = ERR_PTR(ret);
+ goto out;
+}
+
+/**
+ * create_target_field_var - Automatically create a variable for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @var_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events. However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * This function creates a field variable with the name var_name on
+ * the hist trigger currently being defined on the target event. If
+ * subsys_name and event_name are specified, this function simply
+ * verifies that they do in fact match the target event subsystem and
+ * event name.
+ *
+ * Return: The variable created for the field.
+ */
+static struct field_var *
+create_target_field_var(struct hist_trigger_data *target_hist_data,
+ char *subsys_name, char *event_name, char *var_name)
+{
+ struct trace_event_file *file = target_hist_data->event_file;
+
+ if (subsys_name) {
+ struct trace_event_call *call;
+
+ if (!event_name)
+ return NULL;
+
+ call = file->event_call;
+
+ if (strcmp(subsys_name, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ return create_field_var(target_hist_data, file, var_name);
+}
+
+static void onmax_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
+{
+ unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
+
+ seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
+
+ for (i = 0; i < hist_data->n_max_vars; i++) {
+ struct hist_field *save_val = hist_data->max_vars[i]->val;
+ struct hist_field *save_var = hist_data->max_vars[i]->var;
+ u64 val;
+
+ save_var_idx = save_var->var.idx;
+
+ val = tracing_map_read_var(elt, save_var_idx);
+
+ if (save_val->flags & HIST_FIELD_FL_STRING) {
+ seq_printf(m, " %s: %-32s", save_var->var.name,
+ (char *)(uintptr_t)(val));
+ } else
+ seq_printf(m, " %s: %10llu", save_var->var.name, val);
+ }
+}
+
+static void onmax_save(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ unsigned int max_idx = data->onmax.max_var->var.idx;
+ unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
+
+ u64 var_val, max_val;
+
+ var_val = var_ref_vals[max_var_ref_idx];
+ max_val = tracing_map_read_var(elt, max_idx);
+
+ if (var_val <= max_val)
+ return;
+
+ tracing_map_set_var(elt, max_idx, var_val);
+
+ update_max_vars(hist_data, elt, rbe, rec);
+}
+
+static void onmax_destroy(struct action_data *data)
+{
+ unsigned int i;
+
+ destroy_hist_field(data->onmax.max_var, 0);
+ destroy_hist_field(data->onmax.var, 0);
+
+ kfree(data->onmax.var_str);
+ kfree(data->onmax.fn_name);
+
+ for (i = 0; i < data->n_params; i++)
+ kfree(data->params[i]);
+
+ kfree(data);
+}
+
+static int onmax_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct hist_field *var_field, *ref_field, *max_var;
+ unsigned int var_ref_idx = hist_data->n_var_refs;
+ struct field_var *field_var;
+ char *onmax_var_str, *param;
+ unsigned long flags;
+ unsigned int i;
+ int ret = 0;
+
+ onmax_var_str = data->onmax.var_str;
+ if (onmax_var_str[0] != '$') {
+ hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
return -EINVAL;
+ }
+ onmax_var_str++;
- field_name = strsep(&field_str, ".");
- if (field_str) {
- if (strcmp(field_str, "hex") == 0)
- flags |= HIST_FIELD_FL_HEX;
- else {
+ var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
+ if (!var_field) {
+ hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
+ return -EINVAL;
+ }
+
+ flags = HIST_FIELD_FL_VAR_REF;
+ ref_field = create_hist_field(hist_data, NULL, flags, NULL);
+ if (!ref_field)
+ return -ENOMEM;
+
+ if (init_var_ref(ref_field, var_field, NULL, NULL)) {
+ destroy_hist_field(ref_field, 0);
+ ret = -ENOMEM;
+ goto out;
+ }
+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
+ ref_field->var_ref_idx = hist_data->n_var_refs++;
+ data->onmax.var = ref_field;
+
+ data->fn = onmax_save;
+ data->onmax.max_var_ref_idx = var_ref_idx;
+ max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
+ if (IS_ERR(max_var)) {
+ hist_err("onmax: Couldn't create onmax variable: ", "max");
+ ret = PTR_ERR(max_var);
+ goto out;
+ }
+ data->onmax.max_var = max_var;
+
+ for (i = 0; i < data->n_params; i++) {
+ param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
+ if (IS_ERR(field_var)) {
+ hist_err("onmax: Couldn't create field variable: ", param);
+ ret = PTR_ERR(field_var);
+ kfree(param);
+ goto out;
+ }
+
+ hist_data->max_vars[hist_data->n_max_vars++] = field_var;
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_max_var_str++;
+
+ kfree(param);
+ }
+ out:
+ return ret;
+}
+
+static int parse_action_params(char *params, struct action_data *data)
+{
+ char *param, *saved_param;
+ int ret = 0;
+
+ while (params) {
+ if (data->n_params >= SYNTH_FIELDS_MAX)
+ goto out;
+
+ param = strsep(&params, ",");
+ if (!param) {
ret = -EINVAL;
goto out;
}
+
+ param = strstrip(param);
+ if (strlen(param) < 2) {
+ hist_err("Invalid action param: ", param);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ saved_param = kstrdup(param, GFP_KERNEL);
+ if (!saved_param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data->params[data->n_params++] = saved_param;
}
+ out:
+ return ret;
+}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field || !field->size) {
+static struct action_data *onmax_parse(char *str)
+{
+ char *onmax_fn_name, *onmax_var_str;
+ struct action_data *data;
+ int ret = -EINVAL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ onmax_var_str = strsep(&str, ")");
+ if (!onmax_var_str || !str) {
ret = -EINVAL;
- goto out;
+ goto free;
+ }
+
+ data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
+ if (!data->onmax.var_str) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ strsep(&str, ".");
+ if (!str)
+ goto free;
+
+ onmax_fn_name = strsep(&str, "(");
+ if (!onmax_fn_name || !str)
+ goto free;
+
+ if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
+ char *params = strsep(&str, ")");
+
+ if (!params) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ ret = parse_action_params(params, data);
+ if (ret)
+ goto free;
+ } else
+ goto free;
+
+ data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
+ if (!data->onmax.fn_name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ out:
+ return data;
+ free:
+ onmax_destroy(data);
+ data = ERR_PTR(ret);
+ goto out;
+}
+
+static void onmatch_destroy(struct action_data *data)
+{
+ unsigned int i;
+
+ mutex_lock(&synth_event_mutex);
+
+ kfree(data->onmatch.match_event);
+ kfree(data->onmatch.match_event_system);
+ kfree(data->onmatch.synth_event_name);
+
+ for (i = 0; i < data->n_params; i++)
+ kfree(data->params[i]);
+
+ if (data->onmatch.synth_event)
+ data->onmatch.synth_event->ref--;
+
+ kfree(data);
+
+ mutex_unlock(&synth_event_mutex);
+}
+
+static void destroy_field_var(struct field_var *field_var)
+{
+ if (!field_var)
+ return;
+
+ destroy_hist_field(field_var->var, 0);
+ destroy_hist_field(field_var->val, 0);
+
+ kfree(field_var);
+}
+
+static void destroy_field_vars(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_field_vars; i++)
+ destroy_field_var(hist_data->field_vars[i]);
+}
+
+static void save_field_var(struct hist_trigger_data *hist_data,
+ struct field_var *field_var)
+{
+ hist_data->field_vars[hist_data->n_field_vars++] = field_var;
+
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_field_var_str++;
+}
+
+
+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_synth_var_refs; i++)
+ destroy_hist_field(hist_data->synth_var_refs[i], 0);
+}
+
+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
+ struct hist_field *var_ref)
+{
+ hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
+
+ hist_data->var_refs[hist_data->n_var_refs] = var_ref;
+ var_ref->var_ref_idx = hist_data->n_var_refs++;
+}
+
+static int check_synth_field(struct synth_event *event,
+ struct hist_field *hist_field,
+ unsigned int field_pos)
+{
+ struct synth_field *field;
+
+ if (field_pos >= event->n_fields)
+ return -EINVAL;
+
+ field = event->fields[field_pos];
+
+ if (strcmp(field->type, hist_field->type) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct hist_field *
+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
+ char *system, char *event, char *var)
+{
+ struct hist_field *hist_field;
+
+ var++; /* skip '$' */
+
+ hist_field = find_target_event_var(hist_data, system, event, var);
+ if (!hist_field) {
+ if (!system) {
+ system = data->onmatch.match_event_system;
+ event = data->onmatch.match_event;
+ }
+
+ hist_field = find_event_var(hist_data, system, event, var);
+ }
+
+ if (!hist_field)
+ hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
+
+ return hist_field;
+}
+
+static struct hist_field *
+onmatch_create_field_var(struct hist_trigger_data *hist_data,
+ struct action_data *data, char *system,
+ char *event, char *var)
+{
+ struct hist_field *hist_field = NULL;
+ struct field_var *field_var;
+
+ /*
+ * First try to create a field var on the target event (the
+ * currently being defined). This will create a variable for
+ * unqualified fields on the target event, or if qualified,
+ * target fields that have qualified names matching the target.
+ */
+ field_var = create_target_field_var(hist_data, system, event, var);
+
+ if (field_var && !IS_ERR(field_var)) {
+ save_field_var(hist_data, field_var);
+ hist_field = field_var->var;
+ } else {
+ field_var = NULL;
+ /*
+ * If no explicit system.event is specfied, default to
+ * looking for fields on the onmatch(system.event.xxx)
+ * event.
+ */
+ if (!system) {
+ system = data->onmatch.match_event_system;
+ event = data->onmatch.match_event;
+ }
+
+ /*
+ * At this point, we're looking at a field on another
+ * event. Because we can't modify a hist trigger on
+ * another event to add a variable for a field, we need
+ * to create a new trigger on that event and create the
+ * variable at the same time.
+ */
+ hist_field = create_field_var_hist(hist_data, system, event, var);
+ if (IS_ERR(hist_field))
+ goto free;
+ }
+ out:
+ return hist_field;
+ free:
+ destroy_field_var(field_var);
+ hist_field = NULL;
+ goto out;
+}
+
+static int onmatch_create(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ struct action_data *data)
+{
+ char *event_name, *param, *system = NULL;
+ struct hist_field *hist_field, *var_ref;
+ unsigned int i, var_ref_idx;
+ unsigned int field_pos = 0;
+ struct synth_event *event;
+ int ret = 0;
+
+ mutex_lock(&synth_event_mutex);
+ event = find_synth_event(data->onmatch.synth_event_name);
+ if (!event) {
+ hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
+ mutex_unlock(&synth_event_mutex);
+ return -EINVAL;
+ }
+ event->ref++;
+ mutex_unlock(&synth_event_mutex);
+
+ var_ref_idx = hist_data->n_var_refs;
+
+ for (i = 0; i < data->n_params; i++) {
+ char *p;
+
+ p = param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ system = strsep(&param, ".");
+ if (!param) {
+ param = (char *)system;
+ system = event_name = NULL;
+ } else {
+ event_name = strsep(&param, ".");
+ if (!param) {
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ if (param[0] == '$')
+ hist_field = onmatch_find_var(hist_data, data, system,
+ event_name, param);
+ else
+ hist_field = onmatch_create_field_var(hist_data, data,
+ system,
+ event_name,
+ param);
+
+ if (!hist_field) {
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (check_synth_field(event, hist_field, field_pos) == 0) {
+ var_ref = create_var_ref(hist_field, system, event_name);
+ if (!var_ref) {
+ kfree(p);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ save_synth_var_ref(hist_data, var_ref);
+ field_pos++;
+ kfree(p);
+ continue;
+ }
+
+ hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
+ system, event_name, param);
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (field_pos != event->n_fields) {
+ hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ data->fn = action_trace;
+ data->onmatch.synth_event = event;
+ data->onmatch.var_ref_idx = var_ref_idx;
+ out:
+ return ret;
+ err:
+ mutex_lock(&synth_event_mutex);
+ event->ref--;
+ mutex_unlock(&synth_event_mutex);
+
+ goto out;
+}
+
+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
+{
+ char *match_event, *match_event_system;
+ char *synth_event_name, *params;
+ struct action_data *data;
+ int ret = -EINVAL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ match_event = strsep(&str, ")");
+ if (!match_event || !str) {
+ hist_err("onmatch: Missing closing paren: ", match_event);
+ goto free;
+ }
+
+ match_event_system = strsep(&match_event, ".");
+ if (!match_event) {
+ hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
+ goto free;
+ }
+
+ if (IS_ERR(event_file(tr, match_event_system, match_event))) {
+ hist_err_event("onmatch: Invalid subsystem or event name: ",
+ match_event_system, match_event, NULL);
+ goto free;
+ }
+
+ data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
+ if (!data->onmatch.match_event) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
+ if (!data->onmatch.match_event_system) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ strsep(&str, ".");
+ if (!str) {
+ hist_err("onmatch: Missing . after onmatch(): ", str);
+ goto free;
+ }
+
+ synth_event_name = strsep(&str, "(");
+ if (!synth_event_name || !str) {
+ hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
+ goto free;
}
- hist_data->fields[val_idx] = create_hist_field(field, flags);
- if (!hist_data->fields[val_idx]) {
+ data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
+ if (!data->onmatch.synth_event_name) {
ret = -ENOMEM;
+ goto free;
+ }
+
+ params = strsep(&str, ")");
+ if (!params || !str || (str && strlen(str))) {
+ hist_err("onmatch: Missing closing paramlist paren: ", params);
+ goto free;
+ }
+
+ ret = parse_action_params(params, data);
+ if (ret)
+ goto free;
+ out:
+ return data;
+ free:
+ onmatch_destroy(data);
+ data = ERR_PTR(ret);
+ goto out;
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+ hist_data->fields[HITCOUNT_IDX] =
+ create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
+ if (!hist_data->fields[HITCOUNT_IDX])
+ return -ENOMEM;
+
+ hist_data->n_vals++;
+ hist_data->n_fields++;
+
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __create_val_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *var_name, char *field_str,
+ unsigned long flags)
+{
+ struct hist_field *hist_field;
+ int ret = 0;
+
+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+ if (IS_ERR(hist_field)) {
+ ret = PTR_ERR(hist_field);
goto out;
}
+ hist_data->fields[val_idx] = hist_field;
+
++hist_data->n_vals;
+ ++hist_data->n_fields;
- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
ret = -EINVAL;
out:
return ret;
}
+static int create_val_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *field_str)
+{
+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
+}
+
+static int create_var_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *var_name, char *expr_str)
+{
+ unsigned long flags = 0;
+
+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
+ return -EINVAL;
+
+ if (find_var(hist_data, file, var_name) && !hist_data->remove) {
+ hist_err("Variable already defined: ", var_name);
+ return -EINVAL;
+ }
+
+ flags |= HIST_FIELD_FL_VAR;
+ hist_data->n_vars++;
+ if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
+ return -EINVAL;
+
+ return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+}
+
static int create_val_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
char *fields_str, *field_str;
- unsigned int i, j;
+ unsigned int i, j = 1;
int ret;
ret = create_hitcount_val(hist_data);
@@ -533,12 +3910,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
field_str = strsep(&fields_str, ",");
if (!field_str)
break;
+
if (strcmp(field_str, "hitcount") == 0)
continue;
+
ret = create_val_field(hist_data, j++, file, field_str);
if (ret)
goto out;
}
+
if (fields_str && (strcmp(fields_str, "hitcount") != 0))
ret = -EINVAL;
out:
@@ -551,12 +3931,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *field_str)
{
- struct ftrace_event_field *field = NULL;
+ struct hist_field *hist_field = NULL;
+
unsigned long flags = 0;
unsigned int key_size;
int ret = 0;
- if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+ if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
return -EINVAL;
flags |= HIST_FIELD_FL_KEY;
@@ -564,57 +3945,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
if (strcmp(field_str, "stacktrace") == 0) {
flags |= HIST_FIELD_FL_STACKTRACE;
key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+ hist_field = create_hist_field(hist_data, NULL, flags, NULL);
} else {
- char *field_name = strsep(&field_str, ".");
-
- if (field_str) {
- if (strcmp(field_str, "hex") == 0)
- flags |= HIST_FIELD_FL_HEX;
- else if (strcmp(field_str, "sym") == 0)
- flags |= HIST_FIELD_FL_SYM;
- else if (strcmp(field_str, "sym-offset") == 0)
- flags |= HIST_FIELD_FL_SYM_OFFSET;
- else if ((strcmp(field_str, "execname") == 0) &&
- (strcmp(field_name, "common_pid") == 0))
- flags |= HIST_FIELD_FL_EXECNAME;
- else if (strcmp(field_str, "syscall") == 0)
- flags |= HIST_FIELD_FL_SYSCALL;
- else if (strcmp(field_str, "log2") == 0)
- flags |= HIST_FIELD_FL_LOG2;
- else {
- ret = -EINVAL;
- goto out;
- }
+ hist_field = parse_expr(hist_data, file, field_str, flags,
+ NULL, 0);
+ if (IS_ERR(hist_field)) {
+ ret = PTR_ERR(hist_field);
+ goto out;
}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field || !field->size) {
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+ hist_err("Using variable references as keys not supported: ", field_str);
+ destroy_hist_field(hist_field, 0);
ret = -EINVAL;
goto out;
}
- if (is_string_field(field))
- key_size = MAX_FILTER_STR_VAL;
- else
- key_size = field->size;
+ key_size = hist_field->size;
}
- hist_data->fields[key_idx] = create_hist_field(field, flags);
- if (!hist_data->fields[key_idx]) {
- ret = -ENOMEM;
- goto out;
- }
+ hist_data->fields[key_idx] = hist_field;
key_size = ALIGN(key_size, sizeof(u64));
hist_data->fields[key_idx]->size = key_size;
hist_data->fields[key_idx]->offset = key_offset;
+
hist_data->key_size += key_size;
+
if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
ret = -EINVAL;
goto out;
}
hist_data->n_keys++;
+ hist_data->n_fields++;
if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
return -EINVAL;
@@ -658,21 +4022,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
return ret;
}
+static int create_var_fields(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ unsigned int i, j = hist_data->n_vals;
+ int ret = 0;
+
+ unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
+
+ for (i = 0; i < n_vars; i++) {
+ char *var_name = hist_data->attrs->var_defs.name[i];
+ char *expr = hist_data->attrs->var_defs.expr[i];
+
+ ret = create_var_field(hist_data, j++, file, var_name, expr);
+ if (ret)
+ goto out;
+ }
+ out:
+ return ret;
+}
+
+static void free_var_defs(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+ kfree(hist_data->attrs->var_defs.name[i]);
+ kfree(hist_data->attrs->var_defs.expr[i]);
+ }
+
+ hist_data->attrs->var_defs.n_vars = 0;
+}
+
+static int parse_var_defs(struct hist_trigger_data *hist_data)
+{
+ char *s, *str, *var_name, *field_str;
+ unsigned int i, j, n_vars = 0;
+ int ret = 0;
+
+ for (i = 0; i < hist_data->attrs->n_assignments; i++) {
+ str = hist_data->attrs->assignment_str[i];
+ for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
+ field_str = strsep(&str, ",");
+ if (!field_str)
+ break;
+
+ var_name = strsep(&field_str, "=");
+ if (!var_name || !field_str) {
+ hist_err("Malformed assignment: ", var_name);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (n_vars == TRACING_MAP_VARS_MAX) {
+ hist_err("Too many variables defined: ", var_name);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ s = kstrdup(var_name, GFP_KERNEL);
+ if (!s) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ hist_data->attrs->var_defs.name[n_vars] = s;
+
+ s = kstrdup(field_str, GFP_KERNEL);
+ if (!s) {
+ kfree(hist_data->attrs->var_defs.name[n_vars]);
+ ret = -ENOMEM;
+ goto free;
+ }
+ hist_data->attrs->var_defs.expr[n_vars++] = s;
+
+ hist_data->attrs->var_defs.n_vars = n_vars;
+ }
+ }
+
+ return ret;
+ free:
+ free_var_defs(hist_data);
+
+ return ret;
+}
+
static int create_hist_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
int ret;
+ ret = parse_var_defs(hist_data);
+ if (ret)
+ goto out;
+
ret = create_val_fields(hist_data, file);
if (ret)
goto out;
- ret = create_key_fields(hist_data, file);
+ ret = create_var_fields(hist_data, file);
if (ret)
goto out;
- hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ ret = create_key_fields(hist_data, file);
+ if (ret)
+ goto out;
out:
+ free_var_defs(hist_data);
+
return ret;
}
@@ -695,7 +4151,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
char *fields_str = hist_data->attrs->sort_key_str;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
- unsigned int i, j;
+ unsigned int i, j, k;
hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
@@ -743,12 +4199,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
continue;
}
- for (j = 1; j < hist_data->n_fields; j++) {
+ for (j = 1, k = 1; j < hist_data->n_fields; j++) {
+ unsigned int idx;
+
hist_field = hist_data->fields[j];
+ if (hist_field->flags & HIST_FIELD_FL_VAR)
+ continue;
+
+ idx = k++;
+
test_name = hist_field_name(hist_field, 0);
if (strcmp(field_name, test_name) == 0) {
- sort_key->field_idx = j;
+ sort_key->field_idx = idx;
descending = is_descending(field_str);
if (descending < 0) {
ret = descending;
@@ -763,16 +4226,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
break;
}
}
+
hist_data->n_sort_keys = i;
out:
return ret;
}
+static void destroy_actions(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace)
+ onmatch_destroy(data);
+ else if (data->fn == onmax_save)
+ onmax_destroy(data);
+ else
+ kfree(data);
+ }
+}
+
+static int parse_actions(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct action_data *data;
+ unsigned int i;
+ int ret = 0;
+ char *str;
+
+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ str = hist_data->attrs->action_str[i];
+
+ if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
+ char *action_str = str + strlen("onmatch(");
+
+ data = onmatch_parse(tr, action_str);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ data->fn = action_trace;
+ } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
+ char *action_str = str + strlen("onmax(");
+
+ data = onmax_parse(action_str);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ data->fn = onmax_save;
+ } else {
+ ret = -EINVAL;
+ break;
+ }
+
+ hist_data->actions[hist_data->n_actions++] = data;
+ }
+
+ return ret;
+}
+
+static int create_actions(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ struct action_data *data;
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ data = hist_data->actions[i];
+
+ if (data->fn == action_trace) {
+ ret = onmatch_create(hist_data, file, data);
+ if (ret)
+ return ret;
+ } else if (data->fn == onmax_save) {
+ ret = onmax_create(hist_data, data);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+static void print_actions(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == onmax_save)
+ onmax_print(m, hist_data, elt, data);
+ }
+}
+
+static void print_onmax_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ unsigned int i;
+
+ seq_puts(m, ":onmax(");
+ seq_printf(m, "%s", data->onmax.var_str);
+ seq_printf(m, ").%s(", data->onmax.fn_name);
+
+ for (i = 0; i < hist_data->n_max_vars; i++) {
+ seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
+ if (i < hist_data->n_max_vars - 1)
+ seq_puts(m, ",");
+ }
+ seq_puts(m, ")");
+}
+
+static void print_onmatch_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ unsigned int i;
+
+ seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
+ data->onmatch.match_event);
+
+ seq_printf(m, "%s(", data->onmatch.synth_event->name);
+
+ for (i = 0; i < data->n_params; i++) {
+ if (i)
+ seq_puts(m, ",");
+ seq_printf(m, "%s", data->params[i]);
+ }
+
+ seq_puts(m, ")");
+}
+
+static bool actions_match(struct hist_trigger_data *hist_data,
+ struct hist_trigger_data *hist_data_test)
+{
+ unsigned int i, j;
+
+ if (hist_data->n_actions != hist_data_test->n_actions)
+ return false;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+ struct action_data *data_test = hist_data_test->actions[i];
+
+ if (data->fn != data_test->fn)
+ return false;
+
+ if (data->n_params != data_test->n_params)
+ return false;
+
+ for (j = 0; j < data->n_params; j++) {
+ if (strcmp(data->params[j], data_test->params[j]) != 0)
+ return false;
+ }
+
+ if (data->fn == action_trace) {
+ if (strcmp(data->onmatch.synth_event_name,
+ data_test->onmatch.synth_event_name) != 0)
+ return false;
+ if (strcmp(data->onmatch.match_event_system,
+ data_test->onmatch.match_event_system) != 0)
+ return false;
+ if (strcmp(data->onmatch.match_event,
+ data_test->onmatch.match_event) != 0)
+ return false;
+ } else if (data->fn == onmax_save) {
+ if (strcmp(data->onmax.var_str,
+ data_test->onmax.var_str) != 0)
+ return false;
+ if (strcmp(data->onmax.fn_name,
+ data_test->onmax.fn_name) != 0)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+static void print_actions_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace)
+ print_onmatch_spec(m, hist_data, data);
+ else if (data->fn == onmax_save)
+ print_onmax_spec(m, hist_data, data);
+ }
+}
+
+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
+ kfree(hist_data->field_var_hists[i]->cmd);
+ kfree(hist_data->field_var_hists[i]);
+ }
+}
+
static void destroy_hist_data(struct hist_trigger_data *hist_data)
{
+ if (!hist_data)
+ return;
+
destroy_hist_trigger_attrs(hist_data->attrs);
destroy_hist_fields(hist_data);
tracing_map_destroy(hist_data->map);
+
+ destroy_actions(hist_data);
+ destroy_field_vars(hist_data);
+ destroy_field_var_hists(hist_data);
+ destroy_synth_var_refs(hist_data);
+
kfree(hist_data);
}
@@ -781,7 +4458,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
struct tracing_map *map = hist_data->map;
struct ftrace_event_field *field;
struct hist_field *hist_field;
- int i, idx;
+ int i, idx = 0;
for_each_hist_field(i, hist_data) {
hist_field = hist_data->fields[i];
@@ -792,6 +4469,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
+ else if (!field)
+ cmp_fn = tracing_map_cmp_num(hist_field->size,
+ hist_field->is_signed);
else if (is_string_field(field))
cmp_fn = tracing_map_cmp_string;
else
@@ -800,36 +4480,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
idx = tracing_map_add_key_field(map,
hist_field->offset,
cmp_fn);
-
- } else
+ } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
idx = tracing_map_add_sum_field(map);
if (idx < 0)
return idx;
- }
-
- return 0;
-}
-
-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
-{
- struct hist_field *key_field;
- unsigned int i;
-
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME)
- return true;
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ idx = tracing_map_add_var(map);
+ if (idx < 0)
+ return idx;
+ hist_field->var.idx = idx;
+ hist_field->var.hist_data = hist_data;
+ }
}
- return false;
+ return 0;
}
static struct hist_trigger_data *
create_hist_data(unsigned int map_bits,
struct hist_trigger_attrs *attrs,
- struct trace_event_file *file)
+ struct trace_event_file *file,
+ bool remove)
{
const struct tracing_map_ops *map_ops = NULL;
struct hist_trigger_data *hist_data;
@@ -840,6 +4513,12 @@ create_hist_data(unsigned int map_bits,
return ERR_PTR(-ENOMEM);
hist_data->attrs = attrs;
+ hist_data->remove = remove;
+ hist_data->event_file = file;
+
+ ret = parse_actions(hist_data);
+ if (ret)
+ goto free;
ret = create_hist_fields(hist_data, file);
if (ret)
@@ -849,8 +4528,7 @@ create_hist_data(unsigned int map_bits,
if (ret)
goto free;
- if (need_tracing_map_ops(hist_data))
- map_ops = &hist_trigger_elt_comm_ops;
+ map_ops = &hist_trigger_elt_data_ops;
hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
map_ops, hist_data);
@@ -863,12 +4541,6 @@ create_hist_data(unsigned int map_bits,
ret = create_tracing_map_fields(hist_data);
if (ret)
goto free;
-
- ret = tracing_map_init(hist_data->map);
- if (ret)
- goto free;
-
- hist_data->event_file = file;
out:
return hist_data;
free:
@@ -882,18 +4554,39 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- void *rec)
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ u64 *var_ref_vals)
{
+ struct hist_elt_data *elt_data;
struct hist_field *hist_field;
- unsigned int i;
+ unsigned int i, var_idx;
u64 hist_val;
+ elt_data = elt->private_data;
+ elt_data->var_ref_vals = var_ref_vals;
+
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, rec);
+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ var_idx = hist_field->var.idx;
+ tracing_map_set_var(elt, var_idx, hist_val);
+ continue;
+ }
tracing_map_update_sum(elt, i, hist_val);
}
+
+ for_each_hist_key_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ var_idx = hist_field->var.idx;
+ tracing_map_set_var(elt, var_idx, hist_val);
+ }
+ }
+
+ update_field_vars(hist_data, elt, rbe, rec);
}
static inline void add_to_key(char *compound_key, void *key,
@@ -920,15 +4613,31 @@ static inline void add_to_key(char *compound_key, void *key,
memcpy(compound_key + key_field->offset, key, size);
}
-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+static void
+hist_trigger_actions(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, u64 *var_ref_vals)
+{
+ struct action_data *data;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ data = hist_data->actions[i];
+ data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
+ }
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
char compound_key[HIST_KEY_SIZE_MAX];
+ struct tracing_map_elt *elt = NULL;
struct stack_trace stacktrace;
struct hist_field *key_field;
- struct tracing_map_elt *elt;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -949,7 +4658,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
key = entries;
} else {
- field_contents = key_field->fn(key_field, rec);
+ field_contents = key_field->fn(key_field, elt, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -964,9 +4673,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
if (use_compound_key)
key = compound_key;
+ if (hist_data->n_var_refs &&
+ !resolve_var_refs(hist_data, key, var_ref_vals, false))
+ return;
+
elt = tracing_map_insert(hist_data->map, key);
- if (elt)
- hist_trigger_elt_update(hist_data, elt, rec);
+ if (!elt)
+ return;
+
+ hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+
+ if (resolve_var_refs(hist_data, key, var_ref_vals, true))
+ hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -1023,7 +4741,13 @@ hist_trigger_entry_print(struct seq_file *m,
seq_printf(m, "%s: [%llx] %-55s", field_name,
uval, str);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
- char *comm = elt->private_data;
+ struct hist_elt_data *elt_data = elt->private_data;
+ char *comm;
+
+ if (WARN_ON_ONCE(!elt_data))
+ return;
+
+ comm = elt_data->comm;
uval = *(u64 *)(key + key_field->offset);
seq_printf(m, "%s: %-16s[%10llu]", field_name,
@@ -1067,6 +4791,10 @@ hist_trigger_entry_print(struct seq_file *m,
for (i = 1; i < hist_data->n_vals; i++) {
field_name = hist_field_name(hist_data->fields[i], 0);
+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
+ hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+ continue;
+
if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
seq_printf(m, " %s: %10llx", field_name,
tracing_map_read_sum(elt, i));
@@ -1076,6 +4804,8 @@ hist_trigger_entry_print(struct seq_file *m,
}
}
+ print_actions(m, hist_data, elt);
+
seq_puts(m, "\n");
}
@@ -1144,6 +4874,11 @@ static int hist_show(struct seq_file *m, void *v)
hist_trigger_show(m, data, n++);
}
+ if (have_hist_err()) {
+ seq_printf(m, "\nERROR: %s\n", hist_err_str);
+ seq_printf(m, " Last command: %s\n", last_hist_cmd);
+ }
+
out_unlock:
mutex_unlock(&event_mutex);
@@ -1162,37 +4897,22 @@ const struct file_operations event_hist_fops = {
.release = single_release,
};
-static const char *get_hist_field_flags(struct hist_field *hist_field)
-{
- const char *flags_str = NULL;
-
- if (hist_field->flags & HIST_FIELD_FL_HEX)
- flags_str = "hex";
- else if (hist_field->flags & HIST_FIELD_FL_SYM)
- flags_str = "sym";
- else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
- flags_str = "sym-offset";
- else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
- flags_str = "execname";
- else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
- flags_str = "syscall";
- else if (hist_field->flags & HIST_FIELD_FL_LOG2)
- flags_str = "log2";
-
- return flags_str;
-}
-
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
const char *field_name = hist_field_name(hist_field, 0);
- seq_printf(m, "%s", field_name);
- if (hist_field->flags) {
- const char *flags_str = get_hist_field_flags(hist_field);
-
- if (flags_str)
- seq_printf(m, ".%s", flags_str);
- }
+ if (hist_field->var.name)
+ seq_printf(m, "%s=", hist_field->var.name);
+
+ if (hist_field->flags & HIST_FIELD_FL_CPU)
+ seq_puts(m, "cpu");
+ else if (field_name) {
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+ hist_field->flags & HIST_FIELD_FL_ALIAS)
+ seq_putc(m, '$');
+ seq_printf(m, "%s", field_name);
+ } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
+ seq_puts(m, "common_timestamp");
}
static int event_hist_trigger_print(struct seq_file *m,
@@ -1200,7 +4920,8 @@ static int event_hist_trigger_print(struct seq_file *m,
struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
- struct hist_field *key_field;
+ struct hist_field *field;
+ bool have_var = false;
unsigned int i;
seq_puts(m, "hist:");
@@ -1211,25 +4932,47 @@ static int event_hist_trigger_print(struct seq_file *m,
seq_puts(m, "keys=");
for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ field = hist_data->fields[i];
if (i > hist_data->n_vals)
seq_puts(m, ",");
- if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+ if (field->flags & HIST_FIELD_FL_STACKTRACE)
seq_puts(m, "stacktrace");
else
- hist_field_print(m, key_field);
+ hist_field_print(m, field);
}
seq_puts(m, ":vals=");
for_each_hist_val_field(i, hist_data) {
+ field = hist_data->fields[i];
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ have_var = true;
+ continue;
+ }
+
if (i == HITCOUNT_IDX)
seq_puts(m, "hitcount");
else {
seq_puts(m, ",");
- hist_field_print(m, hist_data->fields[i]);
+ hist_field_print(m, field);
+ }
+ }
+
+ if (have_var) {
+ unsigned int n = 0;
+
+ seq_puts(m, ":");
+
+ for_each_hist_val_field(i, hist_data) {
+ field = hist_data->fields[i];
+
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ if (n++)
+ seq_puts(m, ",");
+ hist_field_print(m, field);
+ }
}
}
@@ -1237,28 +4980,36 @@ static int event_hist_trigger_print(struct seq_file *m,
for (i = 0; i < hist_data->n_sort_keys; i++) {
struct tracing_map_sort_key *sort_key;
+ unsigned int idx, first_key_idx;
+
+ /* skip VAR vals */
+ first_key_idx = hist_data->n_vals - hist_data->n_vars;
sort_key = &hist_data->sort_keys[i];
+ idx = sort_key->field_idx;
+
+ if (WARN_ON(idx >= HIST_FIELDS_MAX))
+ return -EINVAL;
if (i > 0)
seq_puts(m, ",");
- if (sort_key->field_idx == HITCOUNT_IDX)
+ if (idx == HITCOUNT_IDX)
seq_puts(m, "hitcount");
else {
- unsigned int idx = sort_key->field_idx;
-
- if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
- return -EINVAL;
-
+ if (idx >= first_key_idx)
+ idx += hist_data->n_vars;
hist_field_print(m, hist_data->fields[idx]);
}
if (sort_key->descending)
seq_puts(m, ".descending");
}
-
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+ if (hist_data->enable_timestamps)
+ seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+
+ print_actions_spec(m, hist_data);
if (data->filter_str)
seq_printf(m, " if %s", data->filter_str);
@@ -1286,6 +5037,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
return 0;
}
+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
+{
+ struct trace_event_file *file;
+ unsigned int i;
+ char *cmd;
+ int ret;
+
+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
+ file = hist_data->field_var_hists[i]->hist_data->event_file;
+ cmd = hist_data->field_var_hists[i]->cmd;
+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+ "!hist", "hist", cmd);
+ }
+}
+
static void event_hist_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
@@ -1298,7 +5064,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
if (!data->ref) {
if (data->name)
del_named_trigger(data);
+
trigger_data_free(data);
+
+ remove_hist_vars(hist_data);
+
+ unregister_field_var_hists(hist_data);
+
destroy_hist_data(hist_data);
}
}
@@ -1425,6 +5197,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return false;
if (key_field->offset != key_field_test->offset)
return false;
+ if (key_field->size != key_field_test->size)
+ return false;
+ if (key_field->is_signed != key_field_test->is_signed)
+ return false;
+ if (!!key_field->var.name != !!key_field_test->var.name)
+ return false;
+ if (key_field->var.name &&
+ strcmp(key_field->var.name, key_field_test->var.name) != 0)
+ return false;
}
for (i = 0; i < hist_data->n_sort_keys; i++) {
@@ -1440,6 +5221,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
(strcmp(data->filter_str, data_test->filter_str) != 0))
return false;
+ if (!actions_match(hist_data, hist_data_test))
+ return false;
+
return true;
}
@@ -1456,6 +5240,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (named_data) {
if (!hist_trigger_match(data, named_data, named_data,
true)) {
+ hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
ret = -EINVAL;
goto out;
}
@@ -1475,13 +5260,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
test->paused = false;
else if (hist_data->attrs->clear)
hist_clear(test);
- else
+ else {
+ hist_err("Hist trigger already exists", NULL);
ret = -EEXIST;
+ }
goto out;
}
}
new:
if (hist_data->attrs->cont || hist_data->attrs->clear) {
+ hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
ret = -ENOENT;
goto out;
}
@@ -1490,7 +5278,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
data->paused = true;
if (named_data) {
- destroy_hist_data(data->private_data);
data->private_data = named_data->private_data;
set_named_trigger_data(data, named_data);
data->ops = &event_hist_trigger_named_ops;
@@ -1502,8 +5289,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
- list_add_rcu(&data->list, &file->triggers);
+ if (hist_data->enable_timestamps) {
+ char *clock = hist_data->attrs->clock;
+
+ ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
+ if (ret) {
+ hist_err("Couldn't set trace_clock: ", clock);
+ goto out;
+ }
+
+ tracing_set_time_stamp_abs(file->tr, true);
+ }
+
+ if (named_data)
+ destroy_hist_data(hist_data);
+
ret++;
+ out:
+ return ret;
+}
+
+static int hist_trigger_enable(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ int ret = 0;
+
+ list_add_tail_rcu(&data->list, &file->triggers);
update_cond_flag(file);
@@ -1512,10 +5323,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
update_cond_flag(file);
ret--;
}
- out:
+
return ret;
}
+static bool have_hist_trigger_match(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool match = false;
+
+ if (hist_data->attrs->name)
+ named_data = find_named_trigger(hist_data->attrs->name);
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (hist_trigger_match(data, test, named_data, false)) {
+ match = true;
+ break;
+ }
+ }
+ }
+
+ return match;
+}
+
+static bool hist_trigger_check_refs(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+
+ if (hist_data->attrs->name)
+ named_data = find_named_trigger(hist_data->attrs->name);
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ hist_data = test->private_data;
+ if (check_var_refs(hist_data))
+ return true;
+ break;
+ }
+ }
+
+ return false;
+}
+
static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -1541,17 +5397,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (unregistered && test->ops->free)
test->ops->free(test->ops, test);
+
+ if (hist_data->enable_timestamps) {
+ if (!hist_data->remove || unregistered)
+ tracing_set_time_stamp_abs(file->tr, false);
+ }
+}
+
+static bool hist_file_check_refs(struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *test;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
+ if (check_var_refs(hist_data))
+ return true;
+ }
+ }
+
+ return false;
}
static void hist_unreg_all(struct trace_event_file *file)
{
struct event_trigger_data *test, *n;
+ struct hist_trigger_data *hist_data;
+ struct synth_event *se;
+ const char *se_name;
+
+ if (hist_file_check_refs(file))
+ return;
list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref--;
+ mutex_unlock(&synth_event_mutex);
+
update_cond_flag(file);
+ if (hist_data->enable_timestamps)
+ tracing_set_time_stamp_abs(file->tr, false);
if (test->ops->free)
test->ops->free(test->ops, test);
}
@@ -1567,16 +5461,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
struct hist_trigger_attrs *attrs;
struct event_trigger_ops *trigger_ops;
struct hist_trigger_data *hist_data;
- char *trigger;
+ struct synth_event *se;
+ const char *se_name;
+ bool remove = false;
+ char *trigger, *p;
int ret = 0;
+ if (glob && strlen(glob)) {
+ last_cmd_set(param);
+ hist_err_clear();
+ }
+
if (!param)
return -EINVAL;
- /* separate the trigger from the filter (k:v [if filter]) */
- trigger = strsep(&param, " \t");
- if (!trigger)
- return -EINVAL;
+ if (glob[0] == '!')
+ remove = true;
+
+ /*
+ * separate the trigger from the filter (k:v [if filter])
+ * allowing for whitespace in the trigger
+ */
+ p = trigger = param;
+ do {
+ p = strstr(p, "if");
+ if (!p)
+ break;
+ if (p == param)
+ return -EINVAL;
+ if (*(p - 1) != ' ' && *(p - 1) != '\t') {
+ p++;
+ continue;
+ }
+ if (p >= param + strlen(param) - strlen("if") - 1)
+ return -EINVAL;
+ if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
+ p++;
+ continue;
+ }
+ break;
+ } while (p);
+
+ if (!p)
+ param = NULL;
+ else {
+ *(p - 1) = '\0';
+ param = strstrip(p);
+ trigger = strstrip(trigger);
+ }
attrs = parse_hist_trigger_attrs(trigger);
if (IS_ERR(attrs))
@@ -1585,7 +5517,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
if (attrs->map_bits)
hist_trigger_bits = attrs->map_bits;
- hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+ hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
if (IS_ERR(hist_data)) {
destroy_hist_trigger_attrs(attrs);
return PTR_ERR(hist_data);
@@ -1593,10 +5525,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
- ret = -ENOMEM;
trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
- if (!trigger_data)
+ if (!trigger_data) {
+ ret = -ENOMEM;
goto out_free;
+ }
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
@@ -1614,8 +5547,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- if (glob[0] == '!') {
+ if (remove) {
+ if (!have_hist_trigger_match(trigger_data, file))
+ goto out_free;
+
+ if (hist_trigger_check_refs(trigger_data, file)) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref--;
+ mutex_unlock(&synth_event_mutex);
+
ret = 0;
goto out_free;
}
@@ -1632,14 +5581,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
} else if (ret < 0)
goto out_free;
+
+ if (get_named_trigger_data(trigger_data))
+ goto enable;
+
+ if (has_hist_vars(hist_data))
+ save_hist_vars(hist_data);
+
+ ret = create_actions(hist_data, file);
+ if (ret)
+ goto out_unreg;
+
+ ret = tracing_map_init(hist_data->map);
+ if (ret)
+ goto out_unreg;
+enable:
+ ret = hist_trigger_enable(trigger_data, file);
+ if (ret)
+ goto out_unreg;
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref++;
+ mutex_unlock(&synth_event_mutex);
+
/* Just return zero, not the number of registered triggers */
ret = 0;
out:
+ if (ret == 0)
+ hist_err_clear();
+
return ret;
+ out_unreg:
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
+ remove_hist_vars(hist_data);
+
kfree(trigger_data);
destroy_hist_data(hist_data);
@@ -1669,7 +5651,8 @@ __init int register_trigger_hist_cmd(void)
}
static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
@@ -1685,7 +5668,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
}
static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1693,7 +5677,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- hist_enable_trigger(data, rec);
+ hist_enable_trigger(data, rec, event);
}
static struct event_trigger_ops hist_enable_trigger_ops = {
@@ -1798,3 +5782,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
return ret;
}
+
+static __init int trace_events_hist_init(void)
+{
+ struct dentry *entry = NULL;
+ struct dentry *d_tracer;
+ int err = 0;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer)) {
+ err = PTR_ERR(d_tracer);
+ goto err;
+ }
+
+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
+ NULL, &synth_events_fops);
+ if (!entry) {
+ err = -ENODEV;
+ goto err;
+ }
+
+ return err;
+ err:
+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
+
+ return err;
+}
+
+fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 87411482a46f..d251cabcf69a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec,
+ struct ring_buffer_event *event)
{
struct event_trigger_data *data;
enum event_trigger_type tt = ETT_NONE;
@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
if (data->paused)
continue;
if (!rec) {
- data->ops->func(data, rec);
+ data->ops->func(data, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data, rec);
+ data->ops->func(data, rec, event);
}
return tt;
}
@@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
void
event_triggers_post_call(struct trace_event_file *file,
enum event_trigger_type tt,
- void *rec)
+ void *rec, struct ring_buffer_event *event)
{
struct event_trigger_data *data;
@@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, rec);
+ data->ops->func(data, rec, event);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
data->named_data = named_data;
}
+struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data)
+{
+ return data->named_data;
+}
+
static void
-traceon_trigger(struct event_trigger_data *data, void *rec)
+traceon_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (tracing_is_on())
return;
@@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec)
+traceon_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (tracing_is_on())
return;
@@ -933,7 +942,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceoff_trigger(struct event_trigger_data *data, void *rec)
+traceoff_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!tracing_is_on())
return;
@@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!tracing_is_on())
return;
@@ -1039,13 +1050,15 @@ static struct event_command trigger_traceoff_cmd = {
#ifdef CONFIG_TRACER_SNAPSHOT
static void
-snapshot_trigger(struct event_trigger_data *data, void *rec)
+snapshot_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
tracing_snapshot();
}
static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1053,7 +1066,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- snapshot_trigger(data, rec);
+ snapshot_trigger(data, rec, event);
}
static int
@@ -1141,13 +1154,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif
static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
trace_dump_stack(STACK_SKIP);
}
static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1155,7 +1170,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- stacktrace_trigger(data, rec);
+ stacktrace_trigger(data, rec, event);
}
static int
@@ -1217,7 +1232,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
}
static void
-event_enable_trigger(struct event_trigger_data *data, void *rec)
+event_enable_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1228,7 +1244,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
}
static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1242,7 +1259,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, rec);
+ event_enable_trigger(data, rec, event);
}
int event_enable_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index ad1d6164e946..50f44b7b2b32 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
};
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
- {
+{
int ret;
va_list ap;
@@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
EXPORT_SYMBOL_GPL(__trace_bprintk);
int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
- {
+{
if (unlikely(!fmt))
return 0;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2014f4351ae0..34fd0e0ec51d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
return;
ret = strncpy_from_user(dst, src, maxlen);
+ if (ret == maxlen)
+ dst[--ret] = '\0';
if (ret < 0) { /* Failed to fetch string */
((u8 *)get_rloc_data(dest))[0] = '\0';
@@ -446,7 +448,7 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
- inode = igrab(d_inode(path.dentry));
+ inode = igrab(d_real_inode(path.dentry));
path_put(&path);
if (!inode || !S_ISREG(inode->i_mode)) {
@@ -602,24 +604,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
- trace_event_name(&tu->tp.call));
- seq_printf(m, " %s:", tu->filename);
-
- /* Don't print "0x (null)" when offset is 0 */
- if (tu->offset) {
- seq_printf(m, "0x%px", (void *)tu->offset);
- } else {
- switch (sizeof(void *)) {
- case 4:
- seq_printf(m, "0x00000000");
- break;
- case 8:
- default:
- seq_printf(m, "0x0000000000000000");
- break;
- }
- }
+ seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system,
+ trace_event_name(&tu->tp.call), tu->filename,
+ (int)(sizeof(void *) * 2), tu->offset);
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 07e75344725b..5cadb1b8b5fe 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
return (u64)atomic64_read(&elt->fields[i].sum);
}
+/**
+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ * @n: The value to assign
+ *
+ * Assign n to variable i associated with the specified tracing_map_elt
+ * instance. The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up.
+ */
+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+ atomic64_set(&elt->vars[i], n);
+ elt->var_set[i] = true;
+}
+
+/**
+ * tracing_map_var_set - Return whether or not a variable has been set
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Return true if the variable has been set, false otherwise. The
+ * index i is the index returned by the call to tracing_map_add_var()
+ * when the tracing map was set up.
+ */
+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
+{
+ return elt->var_set[i];
+}
+
+/**
+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance. The index i is the index returned by the
+ * call to tracing_map_add_var() when the tracing map was set
+ * up.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
+{
+ return (u64)atomic64_read(&elt->vars[i]);
+}
+
+/**
+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance, and reset the variable to the 'not set'
+ * state. The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up. The reset
+ * essentially makes the variable a read-once variable if it's only
+ * accessed using this function.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
+{
+ elt->var_set[i] = false;
+ return (u64)atomic64_read(&elt->vars[i]);
+}
+
int tracing_map_cmp_string(void *val_a, void *val_b)
{
char *a = val_a;
@@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
}
/**
+ * tracing_map_add_var - Add a field describing a tracing_map var
+ * @map: The tracing_map
+ *
+ * Add a var to the map and return the index identifying it in the map
+ * and associated tracing_map_elts. This is the index used for
+ * instance to update a var for a particular tracing_map_elt using
+ * tracing_map_update_var() or reading it via tracing_map_read_var().
+ *
+ * Return: The index identifying the var in the map and associated
+ * tracing_map_elts, or -EINVAL on error.
+ */
+int tracing_map_add_var(struct tracing_map *map)
+{
+ int ret = -EINVAL;
+
+ if (map->n_vars < TRACING_MAP_VARS_MAX)
+ ret = map->n_vars++;
+
+ return ret;
+}
+
+/**
* tracing_map_add_key_field - Add a field describing a tracing_map key
* @map: The tracing_map
* @offset: The offset within the key
@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
atomic64_set(&elt->fields[i].sum, 0);
+ for (i = 0; i < elt->map->n_vars; i++) {
+ atomic64_set(&elt->vars[i], 0);
+ elt->var_set[i] = false;
+ }
+
if (elt->map->ops && elt->map->ops->elt_clear)
elt->map->ops->elt_clear(elt);
}
@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
if (elt->map->ops && elt->map->ops->elt_free)
elt->map->ops->elt_free(elt);
kfree(elt->fields);
+ kfree(elt->vars);
+ kfree(elt->var_set);
kfree(elt->key);
kfree(elt);
}
@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
goto free;
}
+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
+ if (!elt->vars) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
+ if (!elt->var_set) {
+ err = -ENOMEM;
+ goto free;
+ }
+
tracing_map_elt_init_fields(elt);
if (map->ops && map->ops->elt_alloc) {
@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
{
u32 idx, key_hash, test_key;
+ int dup_try = 0;
struct tracing_map_entry *entry;
+ struct tracing_map_elt *val;
key_hash = jhash(key, map->key_size, 0);
if (key_hash == 0)
@@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
entry = TRACING_MAP_ENTRY(map->map, idx);
test_key = entry->key;
- if (test_key && test_key == key_hash && entry->val &&
- keys_match(key, entry->val->key, map->key_size)) {
- if (!lookup_only)
- atomic64_inc(&map->hits);
- return entry->val;
+ if (test_key && test_key == key_hash) {
+ val = READ_ONCE(entry->val);
+ if (val &&
+ keys_match(key, val->key, map->key_size)) {
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
+ return val;
+ } else if (unlikely(!val)) {
+ /*
+ * The key is present. But, val (pointer to elt
+ * struct) is still NULL. which means some other
+ * thread is in the process of inserting an
+ * element.
+ *
+ * On top of that, it's key_hash is same as the
+ * one being inserted right now. So, it's
+ * possible that the element has the same
+ * key as well.
+ */
+
+ dup_try++;
+ if (dup_try > map->map_size) {
+ atomic64_inc(&map->drops);
+ break;
+ }
+ continue;
+ }
}
if (!test_key) {
@@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
atomic64_inc(&map->hits);
return entry->val;
+ } else {
+ /*
+ * cmpxchg() failed. Loop around once
+ * more to check what key was inserted.
+ */
+ dup_try++;
+ continue;
}
}
@@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
return sort_entry;
}
-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
-{
- struct tracing_map_elt *dup_elt;
- unsigned int i;
-
- dup_elt = tracing_map_elt_alloc(elt->map);
- if (IS_ERR(dup_elt))
- return NULL;
-
- if (elt->map->ops && elt->map->ops->elt_copy)
- elt->map->ops->elt_copy(dup_elt, elt);
-
- dup_elt->private_data = elt->private_data;
- memcpy(dup_elt->key, elt->key, elt->map->key_size);
-
- for (i = 0; i < elt->map->n_fields; i++) {
- atomic64_set(&dup_elt->fields[i].sum,
- atomic64_read(&elt->fields[i].sum));
- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
- }
-
- return dup_elt;
-}
-
-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
- unsigned int target, unsigned int dup)
-{
- struct tracing_map_elt *target_elt, *elt;
- bool first_dup = (target - dup) == 1;
- int i;
-
- if (first_dup) {
- elt = sort_entries[target]->elt;
- target_elt = copy_elt(elt);
- if (!target_elt)
- return -ENOMEM;
- sort_entries[target]->elt = target_elt;
- sort_entries[target]->elt_copied = true;
- } else
- target_elt = sort_entries[target]->elt;
-
- elt = sort_entries[dup]->elt;
-
- for (i = 0; i < elt->map->n_fields; i++)
- atomic64_add(atomic64_read(&elt->fields[i].sum),
- &target_elt->fields[i].sum);
-
- sort_entries[dup]->dup = true;
-
- return 0;
-}
-
-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
unsigned int dups = 0, total_dups = 0;
- int err, i, j;
+ int i;
void *key;
if (n_entries < 2)
- return total_dups;
+ return;
sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
(int (*)(const void *, const void *))cmp_entries_dup, NULL);
@@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
dups++; total_dups++;
- err = merge_dup(sort_entries, i - dups, i);
- if (err)
- return err;
continue;
}
key = sort_entries[i]->key;
dups = 0;
}
- if (!total_dups)
- return total_dups;
-
- for (i = 0, j = 0; i < n_entries; i++) {
- if (!sort_entries[i]->dup) {
- sort_entries[j] = sort_entries[i];
- if (j++ != i)
- sort_entries[i] = NULL;
- } else {
- destroy_sort_entry(sort_entries[i]);
- sort_entries[i] = NULL;
- }
- }
-
- return total_dups;
+ WARN_ONCE(total_dups > 0,
+ "Duplicates detected: %d\n", total_dups);
}
static bool is_key(struct tracing_map *map, unsigned int field_idx)
@@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
return 1;
}
- ret = merge_dups(entries, n_entries, map->key_size);
- if (ret < 0)
- goto free;
- n_entries -= ret;
+ detect_dups(entries, n_entries, map->key_size);
if (is_key(map, sort_keys[0].field_idx))
cmp_entries_fn = cmp_entries_key;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 5b5bbf8ae550..053eb92b2d31 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -10,6 +10,7 @@
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
+#define TRACING_MAP_VARS_MAX 16
#define TRACING_MAP_SORT_KEYS_MAX 2
typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
@@ -137,6 +138,8 @@ struct tracing_map_field {
struct tracing_map_elt {
struct tracing_map *map;
struct tracing_map_field *fields;
+ atomic64_t *vars;
+ bool *var_set;
void *key;
void *private_data;
};
@@ -192,6 +195,7 @@ struct tracing_map {
int key_idx[TRACING_MAP_KEYS_MAX];
unsigned int n_keys;
struct tracing_map_sort_key sort_key;
+ unsigned int n_vars;
atomic64_t hits;
atomic64_t drops;
};
@@ -215,11 +219,6 @@ struct tracing_map {
* Element allocation occurs before tracing begins, when the
* tracing_map_init() call is made by client code.
*
- * @elt_copy: At certain points in the lifetime of an element, it may
- * need to be copied. The copy should include a copy of the
- * client-allocated data, which can be copied into the 'to'
- * element from the 'from' element.
- *
* @elt_free: When a tracing_map_elt is freed, this function is called
* and allows client-allocated per-element data to be freed.
*
@@ -233,8 +232,6 @@ struct tracing_map {
*/
struct tracing_map_ops {
int (*elt_alloc)(struct tracing_map_elt *elt);
- void (*elt_copy)(struct tracing_map_elt *to,
- struct tracing_map_elt *from);
void (*elt_free)(struct tracing_map_elt *elt);
void (*elt_clear)(struct tracing_map_elt *elt);
void (*elt_init)(struct tracing_map_elt *elt);
@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
extern int tracing_map_init(struct tracing_map *map);
extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_var(struct tracing_map *map);
extern int tracing_map_add_key_field(struct tracing_map *map,
unsigned int offset,
tracing_map_cmp_fn_t cmp_fn);
@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
extern void tracing_map_update_sum(struct tracing_map_elt *elt,
unsigned int i, u64 n);
+extern void tracing_map_set_var(struct tracing_map_elt *elt,
+ unsigned int i, u64 n);
+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
+
extern void tracing_map_set_field_descr(struct tracing_map *map,
unsigned int i,
unsigned int key_offset,
diff --git a/kernel/ucount.c b/kernel/ucount.c
index b4eeee03934f..f48d1b6376a4 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
+#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
#define UCOUNTS_HASHTABLE_BITS 10
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 913fe4336d2b..dcd6be1996fe 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -19,6 +19,8 @@
#include <linux/proc_ns.h>
#include <linux/sched/task.h>
+static struct kmem_cache *uts_ns_cache __ro_after_init;
+
static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
@@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void)
{
struct uts_namespace *uts_ns;
- uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
if (uts_ns)
kref_init(&uts_ns->kref);
return uts_ns;
@@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void)
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
- * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise
*/
static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
@@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
return ns;
fail_free:
- kfree(ns);
+ kmem_cache_free(uts_ns_cache, ns);
fail_dec:
dec_uts_namespaces(ucounts);
fail:
@@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref)
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
- kfree(ns);
+ kmem_cache_free(uts_ns_cache, ns);
}
static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
@@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = {
.install = utsns_install,
.owner = utsns_owner,
};
+
+void __init uts_ns_init(void)
+{
+ uts_ns_cache = kmem_cache_create_usercopy(
+ "uts_namespace", sizeof(struct uts_namespace), 0,
+ SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct uts_namespace, name),
+ sizeof_field(struct uts_namespace, name),
+ NULL);
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 254e636a3d6b..ca7959be8aaa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -153,10 +153,9 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
struct list_head worklist; /* L: list of pending works */
- int nr_workers; /* L: total number of workers */
- /* nr_idle includes the ones off idle_list for rebinding */
- int nr_idle; /* L: currently idle ones */
+ int nr_workers; /* L: total number of workers */
+ int nr_idle; /* L: currently idle workers */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
@@ -166,7 +165,6 @@ struct worker_pool {
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- /* see manage_workers() for details on the two manager mutexes */
struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
@@ -1604,6 +1602,40 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+static void rcu_work_rcufn(struct rcu_head *rcu)
+{
+ struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
+
+ /* read the comment in __queue_work() */
+ local_irq_disable();
+ __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
+ local_irq_enable();
+}
+
+/**
+ * queue_rcu_work - queue work after a RCU grace period
+ * @wq: workqueue to use
+ * @rwork: work to queue
+ *
+ * Return: %false if @rwork was already pending, %true otherwise. Note
+ * that a full RCU grace period is guaranteed only after a %true return.
+ * While @rwork is guarnateed to be executed after a %false return, the
+ * execution may happen before a full RCU grace period has passed.
+ */
+bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
+{
+ struct work_struct *work = &rwork->work;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ rwork->wq = wq;
+ call_rcu(&rwork->rcu, rcu_work_rcufn);
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(queue_rcu_work);
+
/**
* worker_enter_idle - enter idle state
* @worker: worker which is entering idle state
@@ -3001,6 +3033,26 @@ bool flush_delayed_work(struct delayed_work *dwork)
}
EXPORT_SYMBOL(flush_delayed_work);
+/**
+ * flush_rcu_work - wait for a rwork to finish executing the last queueing
+ * @rwork: the rcu work to flush
+ *
+ * Return:
+ * %true if flush_rcu_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_rcu_work(struct rcu_work *rwork)
+{
+ if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
+ rcu_barrier();
+ flush_work(&rwork->work);
+ return true;
+ } else {
+ return flush_work(&rwork->work);
+ }
+}
+EXPORT_SYMBOL(flush_rcu_work);
+
static bool __cancel_work(struct work_struct *work, bool is_dwork)
{
unsigned long flags;