aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig246
-rw-r--r--kernel/trace/Makefile15
-rw-r--r--kernel/trace/blktrace.c394
-rw-r--r--kernel/trace/bpf_trace.c958
-rw-r--r--kernel/trace/bpf_trace.h34
-rw-r--r--kernel/trace/error_report-traces.c11
-rw-r--r--kernel/trace/fgraph.c23
-rw-r--r--kernel/trace/ftrace.c965
-rw-r--r--kernel/trace/ftrace_internal.h22
-rw-r--r--kernel/trace/pid_list.c495
-rw-r--r--kernel/trace/pid_list.h88
-rw-r--r--kernel/trace/preemptirq_delay_test.c52
-rw-r--r--kernel/trace/ring_buffer.c1436
-rw-r--r--kernel/trace/ring_buffer_benchmark.c48
-rw-r--r--kernel/trace/synth_event_gen_test.c22
-rw-r--r--kernel/trace/trace.c1866
-rw-r--r--kernel/trace/trace.h471
-rw-r--r--kernel/trace/trace_benchmark.c6
-rw-r--r--kernel/trace/trace_boot.c391
-rw-r--r--kernel/trace/trace_branch.c6
-rw-r--r--kernel/trace/trace_clock.c44
-rw-r--r--kernel/trace/trace_dynevent.c91
-rw-r--r--kernel/trace/trace_dynevent.h14
-rw-r--r--kernel/trace/trace_entries.h83
-rw-r--r--kernel/trace/trace_eprobe.c959
-rw-r--r--kernel/trace/trace_event_perf.c33
-rw-r--r--kernel/trace/trace_events.c773
-rw-r--r--kernel/trace/trace_events_filter.c45
-rw-r--r--kernel/trace/trace_events_hist.c2987
-rw-r--r--kernel/trace/trace_events_inject.c6
-rw-r--r--kernel/trace/trace_events_synth.c2245
-rw-r--r--kernel/trace/trace_events_trigger.c99
-rw-r--r--kernel/trace/trace_export.c20
-rw-r--r--kernel/trace/trace_functions.c296
-rw-r--r--kernel/trace/trace_functions_graph.c48
-rw-r--r--kernel/trace/trace_hwlat.c589
-rw-r--r--kernel/trace/trace_irqsoff.c92
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c254
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_osnoise.c2368
-rw-r--r--kernel/trace/trace_output.c305
-rw-r--r--kernel/trace/trace_output.h1
-rw-r--r--kernel/trace/trace_preemptirq.c47
-rw-r--r--kernel/trace/trace_printk.c23
-rw-r--r--kernel/trace/trace_probe.c134
-rw-r--r--kernel/trace/trace_probe.h35
-rw-r--r--kernel/trace/trace_probe_tmpl.h8
-rw-r--r--kernel/trace/trace_recursion_record.c236
-rw-r--r--kernel/trace/trace_sched_wakeup.c97
-rw-r--r--kernel/trace/trace_selftest.c134
-rw-r--r--kernel/trace/trace_seq.c12
-rw-r--r--kernel/trace/trace_stack.c20
-rw-r--r--kernel/trace/trace_stat.c14
-rw-r--r--kernel/trace/trace_synth.h40
-rw-r--r--kernel/trace/trace_syscalls.c20
-rw-r--r--kernel/trace/trace_uprobe.c103
-rw-r--r--kernel/trace/tracing_map.c57
-rw-r--r--kernel/trace/tracing_map.h2
59 files changed, 14923 insertions, 4988 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 402eef84c859..420ff4bc67fd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
-config HAVE_FTRACE_NMI_ENTER
- bool
- help
- See Documentation/trace/ftrace-design.rst
-
config HAVE_FUNCTION_TRACER
bool
help
@@ -36,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ bool
+ help
+ If this is set, then arguments and stack can be found from
+ the pt_regs passed into the function callback regs parameter
+ by default, even without setting the REGS flag in the ftrace_ops.
+ This allows for use of regs_get_kernel_argument() and
+ kernel_stack_pointer().
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -56,6 +60,11 @@ config HAVE_NOP_MCOUNT
help
Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount
+config HAVE_OBJTOOL_MCOUNT
+ bool
+ help
+ Arch supports objtool --mcount
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -72,11 +81,6 @@ config RING_BUFFER
select TRACE_CLOCK
select IRQ_WORK
-config FTRACE_NMI_ENTER
- bool
- depends on HAVE_FTRACE_NMI_ENTER
- default y
-
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
select GLOB
@@ -131,10 +135,9 @@ config TRACING_SUPPORT
depends on STACKTRACE_SUPPORT
default y
-if TRACING_SUPPORT
-
menuconfig FTRACE
bool "Tracers"
+ depends on TRACING_SUPPORT
default y if DEBUG_KERNEL
help
Enable the kernel tracing infrastructure.
@@ -158,6 +161,7 @@ config FUNCTION_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
select TASKS_RCU if PREEMPTION
+ select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
@@ -211,9 +215,14 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -258,20 +267,10 @@ config TRACE_PREEMPT_TOGGLE
Enables hooks which will be called when preemption is first disabled,
and last enabled.
-config PREEMPTIRQ_EVENTS
- bool "Enable trace events for preempt and irq disable/enable"
- select TRACE_IRQFLAGS
- select TRACE_PREEMPT_TOGGLE if PREEMPTION
- select GENERIC_TRACER
- default n
- help
- Enable tracing of disable and enable events for preemption and irqs.
-
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -295,7 +294,6 @@ config IRQSOFF_TRACER
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPTION
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -362,6 +360,68 @@ config HWLAT_TRACER
file. Every time a latency is greater than tracing_thresh, it will
be recorded into the ring buffer.
+config OSNOISE_TRACER
+ bool "OS Noise tracer"
+ select GENERIC_TRACER
+ help
+ In the context of high-performance computing (HPC), the Operating
+ System Noise (osnoise) refers to the interference experienced by an
+ application due to activities inside the operating system. In the
+ context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread
+ can cause noise to the system. Moreover, hardware-related jobs can
+ also cause noise, for example, via SMIs.
+
+ The osnoise tracer leverages the hwlat_detector by running a similar
+ loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
+ the sources of osnoise during its execution. The osnoise tracer takes
+ note of the entry and exit point of any source of interferences,
+ increasing a per-cpu interference counter. It saves an interference
+ counter for each source of interference. The interference counter for
+ NMI, IRQs, SoftIRQs, and threads is increased anytime the tool
+ observes these interferences' entry events. When a noise happens
+ without any interference from the operating system level, the
+ hardware noise counter increases, pointing to a hardware-related
+ noise. In this way, osnoise can account for any source of
+ interference. At the end of the period, the osnoise tracer prints
+ the sum of all noise, the max single noise, the percentage of CPU
+ available for the thread, and the counters for the noise sources.
+
+ In addition to the tracer, a set of tracepoints were added to
+ facilitate the identification of the osnoise source.
+
+ The output will appear in the trace and trace_pipe files.
+
+ To enable this tracer, echo in "osnoise" into the current_tracer
+ file.
+
+config TIMERLAT_TRACER
+ bool "Timerlat tracer"
+ select OSNOISE_TRACER
+ select GENERIC_TRACER
+ help
+ The timerlat tracer aims to help the preemptive kernel developers
+ to find sources of wakeup latencies of real-time threads.
+
+ The tracer creates a per-cpu kernel thread with real-time priority.
+ The tracer thread sets a periodic timer to wakeup itself, and goes
+ to sleep waiting for the timer to fire. At the wakeup, the thread
+ then computes a wakeup latency value as the difference between
+ the current time and the absolute time that the timer was set
+ to expire.
+
+ The tracer prints two lines at every activation. The first is the
+ timer latency observed at the hardirq context before the
+ activation of the thread. The second is the timer latency observed
+ by the thread, which is the same level that cyclictest reports. The
+ ACTIVATION ID field serves to relate the irq execution to its
+ respective thread execution.
+
+ The tracer is build on top of osnoise tracer, and the osnoise:
+ events can be used to trace the source of interference from NMI,
+ IRQs and other threads. It also enables the capture of the
+ stacktrace at the IRQ context, which helps to identify the code
+ path that can cause thread delay.
+
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && PCI
@@ -466,7 +526,6 @@ config PROFILE_ANNOTATED_BRANCHES
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
- imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
@@ -550,14 +609,14 @@ config KPROBE_EVENTS
config KPROBE_EVENTS_ON_NOTRACE
bool "Do NOT protect notrace function from kprobe events"
depends on KPROBE_EVENTS
- depends on KPROBES_ON_FTRACE
+ depends on DYNAMIC_FTRACE
default n
help
This is only for the developers who want to debug ftrace itself
using kprobe events.
If kprobes can use ftrace instead of breakpoint, ftrace related
- functions are protected from kprobe-events to prevent an infinit
+ functions are protected from kprobe-events to prevent an infinite
recursion or any unexpected execution path which leads to a kernel
crash.
@@ -614,6 +673,30 @@ config FTRACE_MCOUNT_RECORD
depends on DYNAMIC_FTRACE
depends on HAVE_FTRACE_MCOUNT_RECORD
+config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ bool
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_CC
+ def_bool y
+ depends on $(cc-option,-mrecord-mcount)
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_OBJTOOL
+ def_bool y
+ depends on HAVE_OBJTOOL_MCOUNT
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on FTRACE_MCOUNT_RECORD
+
+config FTRACE_MCOUNT_USE_RECORDMCOUNT
+ def_bool y
+ depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
+ depends on !FTRACE_MCOUNT_USE_CC
+ depends on !FTRACE_MCOUNT_USE_OBJTOOL
+ depends on FTRACE_MCOUNT_RECORD
+
config TRACING_MAP
bool
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -624,12 +707,30 @@ config TRACING_MAP
generally used outside of that context, and is normally
selected by tracers that use it.
+config SYNTH_EVENTS
+ bool "Synthetic trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ default n
+ help
+ Synthetic events are user-defined trace events that can be
+ used to combine data from other trace events or in fact any
+ data source. Synthetic events can be generated indirectly
+ via the trace() action of histogram triggers or directly
+ by way of an in-kernel API.
+
+ See Documentation/trace/events.rst or
+ Documentation/trace/histogram.rst for details and examples.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
select TRACING
select DYNAMIC_EVENTS
+ select SYNTH_EVENTS
default n
help
Hist triggers allow one or more arbitrary trace event fields
@@ -659,7 +760,7 @@ config TRACEPOINT_BENCHMARK
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
- goes into an infinite loop (calling cond_sched() to let other tasks
+ goes into an infinite loop (calling cond_resched() to let other tasks
run), and calls the tracepoint. Each iteration will record the time
it took to write to the tracepoint and the next iteration that
data will be passed to the tracepoint itself. That is, the tracepoint
@@ -728,6 +829,45 @@ config TRACE_EVAL_MAP_FILE
If unsure, say N.
+config FTRACE_RECORD_RECURSION
+ bool "Record functions that recurse in function tracing"
+ depends on FUNCTION_TRACER
+ help
+ All callbacks that attach to the function tracing have some sort
+ of protection against recursion. Even though the protection exists,
+ it adds overhead. This option will create a file in the tracefs
+ file system called "recursed_functions" that will list the functions
+ that triggered a recursion.
+
+ This will add more overhead to cases that have recursion.
+
+ If unsure, say N
+
+config FTRACE_RECORD_RECURSION_SIZE
+ int "Max number of recursed functions to record"
+ default 128
+ depends on FTRACE_RECORD_RECURSION
+ help
+ This defines the limit of number of functions that can be
+ listed in the "recursed_functions" file, that lists all
+ the functions that caused a recursion to happen.
+ This file can be reset, but the limit can not change in
+ size at runtime.
+
+config RING_BUFFER_RECORD_RECURSION
+ bool "Record functions that recurse in the ring buffer"
+ depends on FTRACE_RECORD_RECURSION
+ # default y, because it is coupled with FTRACE_RECORD_RECURSION
+ default y
+ help
+ The ring buffer has its own internal recursion. Although when
+ recursion happens it wont cause harm because of the protection,
+ but it does cause an unwanted overhead. Enabling this option will
+ place where recursion was detected into the ftrace "recursed_functions"
+ file.
+
+ This will add more overhead to cases that have recursion.
+
config GCOV_PROFILE_FTRACE
bool "Enable GCOV profiling on ftrace subsystem"
depends on GCOV_KERNEL
@@ -798,6 +938,26 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config RING_BUFFER_VALIDATE_TIME_DELTAS
+ bool "Verify ring buffer time stamp deltas"
+ depends on RING_BUFFER
+ help
+ This will audit the time stamps on the ring buffer sub
+ buffer to make sure that all the time deltas for the
+ events on a sub buffer matches the current time stamp.
+ This audit is performed for every event that is not
+ interrupted, or interrupting another event. A check
+ is also made when traversing sub buffers to make sure
+ that all the deltas on the previous sub buffer do not
+ add up to be greater than the current time stamp.
+
+ NOTE: This adds significant overhead to recording of events,
+ and should only be used to test the logic of the ring buffer.
+ Do not use it on production systems.
+
+ Only say Y if you understand what this does, and you
+ still want it enabled. Otherwise say N
+
config MMIOTRACE_TEST
tristate "Test module for mmiotrace"
depends on MMIOTRACE && m
@@ -821,11 +981,15 @@ config PREEMPTIRQ_DELAY_TEST
irq-disabled critical sections for 500us:
modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
+ What's more, if you want to attach the test on the cpu which the latency
+ tracer is running on, specify cpu_affinity=cpu_num at the end of the
+ command.
+
If unsure, say N
config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
- depends on HIST_TRIGGERS
+ depends on SYNTH_EVENTS
help
This option creates a test module to check the base
functionality of in-kernel synthetic event definition and
@@ -848,7 +1012,27 @@ config KPROBE_EVENT_GEN_TEST
If unsure, say N.
-endif # FTRACE
+config HIST_TRIGGERS_DEBUG
+ bool "Hist trigger debug support"
+ depends on HIST_TRIGGERS
+ help
+ Add "hist_debug" file for each event, which when read will
+ dump out a bunch of internal details about the hist triggers
+ defined on that event.
+
+ The hist_debug file serves a couple of purposes:
-endif # TRACING_SUPPORT
+ - Helps developers verify that nothing is broken.
+ - Provides educational information to support the details
+ of the hist trigger internals as described by
+ Documentation/trace/histogram-design.rst.
+
+ The hist_debug output only covers the data structures
+ related to the histogram definitions themselves and doesn't
+ display the internals of map buckets or variable values of
+ running histograms.
+
+ If unsure, say N.
+
+endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f9dcd19165fa..bedc5caceec7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -2,9 +2,12 @@
# Do not instrument the tracer itself:
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
+
+# Avoid recursion due to instrumentation.
+KCSAN_SANITIZE := n
ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
@@ -28,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
+CFLAGS_bpf_trace.o := -I$(src)
+
CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
@@ -42,6 +47,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o
@@ -53,6 +59,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
+obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -71,10 +78,13 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
+obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
@@ -86,6 +96,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
+obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ca39dc3230cb..1183c88634aa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -69,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- int pc = 0;
+ unsigned int trace_ctx = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -104,7 +107,7 @@ record_it:
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
@@ -170,10 +173,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0);
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
#endif
local_irq_restore(flags);
}
@@ -219,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
+ unsigned int trace_ctx = 0;
pid_t pid;
- int cpu, pc = 0;
+ int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
@@ -249,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
- 0, pc);
+ trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
@@ -298,7 +302,7 @@ record_it:
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
return;
}
}
@@ -308,8 +312,6 @@ record_it:
static void blk_trace_free(struct blk_trace *bt)
{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
debugfs_remove(bt->dir);
free_percpu(bt->sequence);
@@ -344,7 +346,8 @@ static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->debugfs_mutex));
if (!bt)
return -EINVAL;
@@ -358,9 +361,9 @@ int blk_trace_remove(struct request_queue *q)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_remove(q);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -445,7 +448,7 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
&relay_file_operations);
}
-static struct rchan_callbacks blk_relay_callbacks = {
+static const struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
@@ -454,14 +457,9 @@ static struct rchan_callbacks blk_relay_callbacks = {
static void blk_trace_setup_lba(struct blk_trace *bt,
struct block_device *bdev)
{
- struct hd_struct *part = NULL;
-
- if (bdev)
- part = bdev->bd_part;
-
- if (part) {
- bt->start_lba = part->start_sect;
- bt->end_lba = part->start_sect + part->nr_sects;
+ if (bdev) {
+ bt->start_lba = bdev->bd_start_sect;
+ bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
} else {
bt->start_lba = 0;
bt->end_lba = -1ULL;
@@ -479,12 +477,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct dentry *dir = NULL;
int ret;
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- if (!blk_debugfs_root)
- return -ENOENT;
-
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
@@ -494,6 +491,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->debugfs_mutex))) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -507,21 +515,36 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;
- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
+ /*
+ * When tracing the whole disk reuse the existing debugfs directory
+ * created by the block layer on init. For partitions block devices,
+ * and scsi-generic block devices we create a temporary new debugfs
+ * directory that will be removed once the trace ends.
+ */
+ if (bdev && !bdev_is_partition(bdev))
+ dir = q->debugfs_dir;
+ else
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ ret = -ENOENT;
+ goto err;
+ }
+
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+ debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -543,16 +566,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto err;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;
@@ -585,9 +603,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -633,7 +651,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
@@ -673,9 +691,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_startstop(q, start);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -704,7 +722,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
switch (cmd) {
case BLKTRACESETUP:
@@ -719,7 +737,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
#endif
case BLKTRACESTART:
start = 1;
- /* fall through */
+ fallthrough;
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
@@ -731,7 +749,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
break;
}
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -742,14 +760,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex))) {
+ lockdep_is_held(&q->debugfs_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
}
#ifdef CONFIG_BLK_CGROUP
@@ -767,19 +785,19 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
return cgroup_id(bio_blkcg(bio)->css.cgroup);
}
#else
-u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
return 0;
}
#endif
static u64
-blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+blk_trace_request_get_cgid(struct request *rq)
{
if (!rq->bio)
return 0;
/* Use the first bio */
- return blk_trace_bio_get_cgid(q, rq->bio);
+ return blk_trace_bio_get_cgid(rq->q, rq->bio);
}
/*
@@ -798,7 +816,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt;
@@ -816,37 +834,40 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL, cgid);
+ rq->cmd_flags, what, blk_status_to_errno(error), 0,
+ NULL, cgid);
rcu_read_unlock();
}
-static void blk_add_trace_rq_insert(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_issue(void *ignore,
- struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
+}
+
+static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
+{
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
+ blk_trace_request_get_cgid(rq));
}
-static void blk_add_trace_rq_requeue(void *ignore,
- struct request_queue *q,
- struct request *rq)
+static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
- int error, unsigned int nr_bytes)
+ blk_status_t error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
- blk_trace_request_get_cgid(rq->q, rq));
+ blk_trace_request_get_cgid(rq));
}
/**
@@ -878,76 +899,38 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio,
- int error)
+ struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
+ blk_status_to_errno(bio->bi_status));
}
-static void blk_add_trace_bio_backmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
+ 0);
}
-static void blk_add_trace_bio_frontmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
+ 0);
}
-static void blk_add_trace_bio_queue(void *ignore,
- struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
}
-static void blk_add_trace_getrq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
+static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL, 0);
- rcu_read_unlock();
- }
-}
-
-
-static void blk_add_trace_sleeprq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
-{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL, 0);
- rcu_read_unlock();
- }
+ blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -982,10 +965,9 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
rcu_read_unlock();
}
-static void blk_add_trace_split(void *ignore,
- struct request_queue *q, struct bio *bio,
- unsigned int pdu)
+static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
@@ -995,8 +977,10 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu, blk_trace_bio_get_cgid(q, bio));
+ BLK_TA_SPLIT,
+ blk_status_to_errno(bio->bi_status),
+ sizeof(rpdu), &rpdu,
+ blk_trace_bio_get_cgid(q, bio));
}
rcu_read_unlock();
}
@@ -1004,20 +988,16 @@ static void blk_add_trace_split(void *ignore,
/**
* blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @bio: the source bio
- * @dev: target device
+ * @dev: source device
* @from: source sector
*
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
+ * Called after a bio is remapped to a different device and/or sector.
**/
-static void blk_add_trace_bio_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+ sector_t from)
{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
@@ -1033,7 +1013,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
@@ -1041,7 +1022,6 @@ static void blk_add_trace_bio_remap(void *ignore,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @rq: the source request
* @dev: target device
* @from: source sector
@@ -1051,16 +1031,14 @@ static void blk_add_trace_bio_remap(void *ignore,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(void *ignore,
- struct request_queue *q,
- struct request *rq, dev_t dev,
+static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
@@ -1072,13 +1050,12 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
/**
* blk_add_driver_data - Add binary message with driver-specific data
- * @q: queue the io is for
* @rq: io request
* @data: driver-specific data
* @len: length of driver-specific data
@@ -1087,14 +1064,12 @@ static void blk_add_trace_rq_remap(void *ignore,
* Some drivers might want to write driver-specific data per request.
*
**/
-void blk_add_driver_data(struct request_queue *q,
- struct request *rq,
- void *data, size_t len)
+void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
struct blk_trace *bt;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
@@ -1102,7 +1077,7 @@ void blk_add_driver_data(struct request_queue *q,
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1115,6 +1090,8 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
+ ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
+ WARN_ON(ret);
ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
@@ -1131,8 +1108,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
- WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1152,7 +1127,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@ -1161,6 +1135,7 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
@@ -1253,21 +1228,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent, has_cg);
+ const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
-static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r, bool has_cg)
-{
- const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- __u64 sector_from = __r->sector_from;
-
- r->device_from = be32_to_cpu(__r->device_from);
- r->device_to = be32_to_cpu(__r->device_to);
- r->sector_from = be64_to_cpu(sector_from);
-}
-
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
@@ -1315,7 +1279,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
* ones now use the 64bit ino as the whole ID and
* no longer use generation.
*
- * Regarldess of the content, always output
+ * Regardless of the content, always output
* "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
* be mapped back to @id on both 64 and 32bit ino
* setups. See __kernfs_fh_to_dentry().
@@ -1357,7 +1321,7 @@ static void blk_log_dump_pdu(struct trace_seq *s,
i == 0 ? "" : " ", pdu_buf[i]);
/*
- * stop when the rest is just zeroes and indicate so
+ * stop when the rest is just zeros and indicate so
* with a ".." appended
*/
if (i == end && end != pdu_len - 1) {
@@ -1407,13 +1371,13 @@ static void blk_log_with_error(struct trace_seq *s,
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
- struct blk_io_trace_remap r = { .device_from = 0, };
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ MAJOR(be32_to_cpu(__r->device_from)),
+ MINOR(be32_to_cpu(__r->device_from)),
+ be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
@@ -1637,10 +1601,19 @@ static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ }
+
put_probe_ref();
synchronize_rcu();
blk_trace_free(bt);
@@ -1669,10 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto free_bt;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
@@ -1789,36 +1759,19 @@ static ssize_t blk_trace_mask2str(char *buf, int mask)
return p - buf;
}
-static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
-{
- if (bdev->bd_disk == NULL)
- return NULL;
-
- return bdev_get_queue(bdev);
-}
-
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- struct request_queue *q;
- struct block_device *bdev;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
ssize_t ret = -ENXIO;
- bdev = bdget(part_devt(p));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
-
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
@@ -1836,10 +1789,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
-out_bdput:
- bdput(bdev);
-out:
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -1847,9 +1797,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct block_device *bdev;
- struct request_queue *q;
- struct hd_struct *p;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1865,24 +1814,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
value = ret;
}
- } else if (kstrtoull(buf, 0, &value))
- goto out;
-
- ret = -ENXIO;
-
- p = dev_to_part(dev);
- bdev = bdget(part_devt(p));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
+ } else {
+ if (kstrtoull(buf, 0, &value))
+ goto out;
+ }
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
if (!!value == !!bt) {
ret = 0;
@@ -1899,7 +1839,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
}
if (ret == 0) {
@@ -1914,9 +1854,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
-out_bdput:
- bdput(bdev);
+ mutex_unlock(&q->debugfs_mutex);
out:
return ret ? ret : count;
}
@@ -1935,7 +1873,17 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs: buffer to be filled
+ * @op: REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ * Maps the REQ_OP_XXX to character and fills the buffer provided by the
+ * caller with resulting string.
+ *
+ **/
+void blk_fill_rwbs(char *rwbs, unsigned int op)
{
int i = 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 19e793aa441a..21aa30644219 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -7,18 +7,30 @@
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
+#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
+
+#include <net/bpf_sk_storage.h>
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/btf.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include "bpf_trace.h"
+
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
@@ -62,6 +74,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
+ u64 flags, const struct btf **btf,
+ s32 *btf_id);
+
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
@@ -80,10 +96,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
- if (in_nmi()) /* not supported yet */
- return 1;
-
- preempt_disable();
+ cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/*
@@ -100,7 +113,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
- * a heurisitc to speed up execution.
+ * a heuristic to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
@@ -111,15 +124,13 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
- ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
out:
__this_cpu_dec(bpf_prog_active);
- preempt_enable();
return ret;
}
-EXPORT_SYMBOL_GPL(trace_call_bpf);
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
@@ -138,18 +149,24 @@ static const struct bpf_func_proto bpf_override_return_proto = {
};
#endif
-BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
- int ret = probe_user_read(dst, unsafe_ptr, size);
+ int ret;
+ ret = copy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_proto = {
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -158,18 +175,35 @@ static const struct bpf_func_proto bpf_probe_read_user_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_str_common(void *dst, u32 size,
+ const void __user *unsafe_ptr)
{
- int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
+ int ret;
+ /*
+ * NB: We rely on strncpy_from_user() not copying junk past the NUL
+ * terminator into `dst`.
+ *
+ * strncpy_from_user() does long-sized strides in the fast path. If the
+ * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
+ * then there could be junk after the NUL in `dst`. If user takes `dst`
+ * and keys a hash map with it, then semantically identical strings can
+ * occupy multiple entries in the map.
+ */
+ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -179,17 +213,12 @@ static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
};
static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
+ int ret;
+ ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
- goto out;
- ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
- probe_kernel_read_strict(dst, unsafe_ptr, size);
- if (unlikely(ret < 0))
-out:
memset(dst, 0, size);
return ret;
}
@@ -197,10 +226,10 @@ out:
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.func = bpf_probe_read_kernel,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -209,42 +238,22 @@ static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
- const void *, unsafe_ptr)
-{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
-}
-
-static const struct bpf_func_proto bpf_probe_read_compat_proto = {
- .func = bpf_probe_read_compat,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_ANYTHING,
-};
-
static __always_inline int
-bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
- int ret = security_locked_down(LOCKDOWN_BPF_READ);
+ int ret;
- if (unlikely(ret < 0))
- goto out;
/*
- * The strncpy_from_unsafe_*() call will likely not fill the entire
- * buffer, but that's okay in this circumstance as we're probing
+ * The strncpy_from_kernel_nofault() call will likely not fill the
+ * entire buffer, but that's okay in this circumstance as we're probing
* arbitrary memory anyway similar to bpf_probe_read_*() and might
* as well probe the stack. Thus, memory is explicitly cleared
* only in error case, so that improper users ignoring return
* code altogether don't copy garbage; otherwise length of string
* is returned that can be used for bpf_perf_event_output() et al.
*/
- ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
- strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
memset(dst, 0, size);
return ret;
}
@@ -252,10 +261,10 @@ out:
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.func = bpf_probe_read_kernel_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -264,10 +273,34 @@ static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+ .func = bpf_probe_read_compat,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_str_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
@@ -278,6 +311,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
+#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
@@ -303,7 +337,7 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- return probe_user_write(unsafe_ptr, src, size);
+ return copy_to_user_nofault(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
@@ -311,154 +345,199 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
return &bpf_probe_write_user_proto;
}
-/*
- * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
- */
+static DEFINE_RAW_SPINLOCK(trace_printk_lock);
+
+#define MAX_TRACE_PRINTK_VARARGS 3
+#define BPF_TRACE_PRINTK_SIZE 1024
+
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
- bool str_seen = false;
- int mod[3] = {};
- int fmt_cnt = 0;
- u64 unsafe_addr;
- char buf[64];
- int i;
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- return -EINVAL;
+ u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
+ u32 *bin_args;
+ static char buf[BPF_TRACE_PRINTK_SIZE];
+ unsigned long flags;
+ int ret;
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
- return -EINVAL;
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
+ MAX_TRACE_PRINTK_VARARGS);
+ if (ret < 0)
+ return ret;
- if (fmt[i] != '%')
- continue;
+ raw_spin_lock_irqsave(&trace_printk_lock, flags);
+ ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
- if (fmt_cnt >= 3)
- return -EINVAL;
+ trace_bpf_trace_printk(buf);
+ raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- } else if (fmt[i] == 'p' || fmt[i] == 's') {
- mod[fmt_cnt]++;
- /* disallow any further format extensions */
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
- fmt_cnt++;
- if (fmt[i] == 's') {
- if (str_seen)
- /* allow only one '%s' per fmt string */
- return -EINVAL;
- str_seen = true;
-
- switch (fmt_cnt) {
- case 1:
- unsafe_addr = arg1;
- arg1 = (long) buf;
- break;
- case 2:
- unsafe_addr = arg2;
- arg2 = (long) buf;
- break;
- case 3:
- unsafe_addr = arg3;
- arg3 = (long) buf;
- break;
- }
- buf[0] = 0;
- strncpy_from_unsafe(buf,
- (void *) (long) unsafe_addr,
- sizeof(buf));
- }
- continue;
- }
+ bpf_bprintf_cleanup();
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- }
-
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x')
- return -EINVAL;
- fmt_cnt++;
- }
-
-/* Horrid workaround for getting va_list handling working with different
- * argument type combinations generically for 32 and 64 bit archs.
- */
-#define __BPF_TP_EMIT() __BPF_ARG3_TP()
-#define __BPF_TP(...) \
- __trace_printk(0 /* Fake ip */, \
- fmt, ##__VA_ARGS__)
-
-#define __BPF_ARG1_TP(...) \
- ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_TP(arg1, ##__VA_ARGS__) \
- : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
- : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
-
-#define __BPF_ARG2_TP(...) \
- ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
- : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
- : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
-
-#define __BPF_ARG3_TP(...) \
- ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
- : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
- : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
-
- return __BPF_TP_EMIT();
+ return ret;
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
};
-const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+static void __set_printk_clr_event(void)
{
/*
- * this program might be calling bpf_trace_printk,
- * so allocate per-cpu printk buffers
+ * This program might be calling bpf_trace_printk,
+ * so enable the associated bpf_trace/bpf_trace_printk event.
+ * Repeat this each time as it is possible a user has
+ * disabled bpf_trace_printk events. By loading a program
+ * calling bpf_trace_printk() however the user has expressed
+ * the intent to see such events.
*/
- trace_printk_init_buffers();
+ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
+ pr_warn_ratelimited("could not enable bpf_trace_printk events");
+}
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+ __set_printk_clr_event();
return &bpf_trace_printk_proto;
}
+BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
+ u32, data_len)
+{
+ static char buf[BPF_TRACE_PRINTK_SIZE];
+ unsigned long flags;
+ int ret, num_args;
+ u32 *bin_args;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ if (ret < 0)
+ return ret;
+
+ raw_spin_lock_irqsave(&trace_printk_lock, flags);
+ ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+
+ trace_bpf_trace_printk(buf);
+ raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+
+ bpf_bprintf_cleanup();
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_trace_vprintk_proto = {
+ .func = bpf_trace_vprintk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
+{
+ __set_printk_clr_event();
+ return &bpf_trace_vprintk_proto;
+}
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+ const void *, data, u32, data_len)
+{
+ int err, num_args;
+ u32 *bin_args;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ if (err < 0)
+ return err;
+
+ seq_bprintf(m, fmt, bin_args);
+
+ bpf_bprintf_cleanup();
+
+ return seq_has_overflowed(m) ? -EOVERFLOW : 0;
+}
+
+BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
+
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+ .func = bpf_seq_printf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+ return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static const struct bpf_func_proto bpf_seq_write_proto = {
+ .func = bpf_seq_write,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr,
+ u32, btf_ptr_size, u64, flags)
+{
+ const struct btf *btf;
+ s32 btf_id;
+ int ret;
+
+ ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
+ if (ret)
+ return ret;
+
+ return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags);
+}
+
+static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
+ .func = bpf_seq_printf_btf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_seq_file_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@@ -615,7 +694,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -670,12 +749,41 @@ BPF_CALL_0(bpf_get_current_task)
return (long) current;
}
-static const struct bpf_func_proto bpf_get_current_task_proto = {
+const struct bpf_func_proto bpf_get_current_task_proto = {
.func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_get_current_task_btf)
+{
+ return (unsigned long) current;
+}
+
+const struct bpf_func_proto bpf_get_current_task_btf_proto = {
+ .func = bpf_get_current_task_btf,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
+BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
+{
+ return (unsigned long) task_pt_regs(task);
+}
+
+BTF_ID_LIST(bpf_task_pt_regs_ids)
+BTF_ID(struct, pt_regs)
+
+const struct bpf_func_proto bpf_task_pt_regs_proto = {
+ .func = bpf_task_pt_regs,
+ .gpl_only = true,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_btf_id = &bpf_task_pt_regs_ids[0],
+};
+
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -732,7 +840,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- if (in_nmi()) {
+ if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -740,7 +848,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
- if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
+ if (irq_work_is_busy(&work->irq_work))
return -EBUSY;
/* Add the current task, which is the target of sending signal,
@@ -781,8 +889,257 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
+{
+ long len;
+ char *p;
+
+ if (!sz)
+ return 0;
+
+ p = d_path(path, buf, sz);
+ if (IS_ERR(p)) {
+ len = PTR_ERR(p);
+ } else {
+ len = buf + sz - p;
+ memmove(buf, p, len);
+ }
+
+ return len;
+}
+
+BTF_SET_START(btf_allowlist_d_path)
+#ifdef CONFIG_SECURITY
+BTF_ID(func, security_file_permission)
+BTF_ID(func, security_inode_getattr)
+BTF_ID(func, security_file_open)
+#endif
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, security_path_truncate)
+#endif
+BTF_ID(func, vfs_truncate)
+BTF_ID(func, vfs_fallocate)
+BTF_ID(func, dentry_open)
+BTF_ID(func, vfs_getattr)
+BTF_ID(func, filp_close)
+BTF_SET_END(btf_allowlist_d_path)
+
+static bool bpf_d_path_allowed(const struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+
+ if (prog->type == BPF_PROG_TYPE_LSM)
+ return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
+
+ return btf_id_set_contains(&btf_allowlist_d_path,
+ prog->aux->attach_btf_id);
+}
+
+BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path)
+
+static const struct bpf_func_proto bpf_d_path_proto = {
+ .func = bpf_d_path,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_d_path_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .allowed = bpf_d_path_allowed,
+};
+
+#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \
+ BTF_F_PTR_RAW | BTF_F_ZERO)
+
+static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
+ u64 flags, const struct btf **btf,
+ s32 *btf_id)
+{
+ const struct btf_type *t;
+
+ if (unlikely(flags & ~(BTF_F_ALL)))
+ return -EINVAL;
+
+ if (btf_ptr_size != sizeof(struct btf_ptr))
+ return -EINVAL;
+
+ *btf = bpf_get_btf_vmlinux();
+
+ if (IS_ERR_OR_NULL(*btf))
+ return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
+
+ if (ptr->type_id > 0)
+ *btf_id = ptr->type_id;
+ else
+ return -EINVAL;
+
+ if (*btf_id > 0)
+ t = btf_type_by_id(*btf, *btf_id);
+ if (*btf_id <= 0 || !t)
+ return -ENOENT;
+
+ return 0;
+}
+
+BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr,
+ u32, btf_ptr_size, u64, flags)
+{
+ const struct btf *btf;
+ s32 btf_id;
+ int ret;
+
+ ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
+ if (ret)
+ return ret;
+
+ return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size,
+ flags);
+}
+
+const struct bpf_func_proto bpf_snprintf_btf_proto = {
+ .func = bpf_snprintf_btf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-2];
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
+ .func = bpf_get_func_ip_tracing,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
+{
+ struct kprobe *kp = kprobe_running();
+
+ return kp ? (uintptr_t)kp->addr : 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
+ .func = bpf_get_func_ip_kprobe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
+ .func = bpf_get_attach_cookie_trace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
+{
+ return ctx->event->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
+ .func = bpf_get_attach_cookie_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
+{
+#ifndef CONFIG_X86
+ return -ENOENT;
+#else
+ static const u32 br_entry_size = sizeof(struct perf_branch_entry);
+ u32 entry_cnt = size / br_entry_size;
+
+ entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (!entry_cnt)
+ return -ENOENT;
+
+ return entry_cnt * br_entry_size;
+#endif
+}
+
+static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+ .func = bpf_get_branch_snapshot,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ if ((u64) n >= nr_args)
+ return -EINVAL;
+ *value = ((u64 *)ctx)[n];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_proto = {
+ .func = get_func_arg,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ *value = ((u64 *)ctx)[nr_args];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ret_proto = {
+ .func = get_func_ret,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(get_func_arg_cnt, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
+ .func = get_func_arg_cnt,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
-tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -799,12 +1156,18 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -817,34 +1180,85 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return &bpf_probe_read_kernel_proto;
- case BPF_FUNC_probe_read:
- return &bpf_probe_read_compat_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return &bpf_probe_read_kernel_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ case BPF_FUNC_probe_read:
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return &bpf_probe_read_compat_str_proto;
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_compat_str_proto;
+#endif
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
case BPF_FUNC_send_signal_thread:
return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ case BPF_FUNC_get_task_stack:
+ return &bpf_get_task_stack_proto;
+ case BPF_FUNC_copy_from_user:
+ return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ case BPF_FUNC_snprintf_btf:
+ return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_per_cpu_ptr:
+ return &bpf_per_cpu_ptr_proto;
+ case BPF_FUNC_this_cpu_ptr:
+ return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_task_storage_get:
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ return &bpf_task_storage_delete_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
+ case BPF_FUNC_get_func_ip:
+ return &bpf_get_func_ip_proto_tracing;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -858,14 +1272,16 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto;
- case BPF_FUNC_perf_event_read_value:
- return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
+ case BPF_FUNC_get_func_ip:
+ return &bpf_get_func_ip_proto_kprobe;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_trace;
default:
- return tracing_func_proto(func_id, prog);
+ return bpf_tracing_func_proto(func_id, prog);
}
}
@@ -918,7 +1334,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -974,8 +1390,10 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_trace;
default:
- return tracing_func_proto(func_id, prog);
+ return bpf_tracing_func_proto(func_id, prog);
}
}
@@ -1028,6 +1446,41 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
.arg3_type = ARG_CONST_SIZE,
};
+BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
+ void *, buf, u32, size, u64, flags)
+{
+ static const u32 br_entry_size = sizeof(struct perf_branch_entry);
+ struct perf_branch_stack *br_stack = ctx->data->br_stack;
+ u32 to_copy;
+
+ if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
+ return -EINVAL;
+
+ if (unlikely(!br_stack))
+ return -ENOENT;
+
+ if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
+ return br_stack->nr * br_entry_size;
+
+ if (!buf || (size % br_entry_size != 0))
+ return -EINVAL;
+
+ to_copy = min_t(u32, br_stack->nr * br_entry_size, size);
+ memcpy(buf, br_stack->entries, to_copy);
+
+ return to_copy;
+}
+
+static const struct bpf_func_proto bpf_read_branch_records_proto = {
+ .func = bpf_read_branch_records,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1035,13 +1488,17 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
- return &bpf_get_stackid_proto_tp;
+ return &bpf_get_stackid_proto_pe;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto_tp;
+ return &bpf_get_stack_proto_pe;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
+ case BPF_FUNC_read_branch_records:
+ return &bpf_read_branch_records_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_pe;
default:
- return tracing_func_proto(func_id, prog);
+ return bpf_tracing_func_proto(func_id, prog);
}
}
@@ -1099,11 +1556,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
extern const struct bpf_func_proto bpf_skb_output_proto;
+extern const struct bpf_func_proto bpf_xdp_output_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1152,7 +1610,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1168,20 +1626,67 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
default:
- return tracing_func_proto(func_id, prog);
+ return bpf_tracing_func_proto(func_id, prog);
}
}
-static const struct bpf_func_proto *
+const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *fn;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_skb_output:
return &bpf_skb_output_proto;
+ case BPF_FUNC_xdp_output:
+ return &bpf_xdp_output_proto;
+ case BPF_FUNC_skc_to_tcp6_sock:
+ return &bpf_skc_to_tcp6_sock_proto;
+ case BPF_FUNC_skc_to_tcp_sock:
+ return &bpf_skc_to_tcp_sock_proto;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ return &bpf_skc_to_tcp_timewait_sock_proto;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ return &bpf_skc_to_tcp_request_sock_proto;
+ case BPF_FUNC_skc_to_udp6_sock:
+ return &bpf_skc_to_udp6_sock_proto;
+ case BPF_FUNC_skc_to_unix_sock:
+ return &bpf_skc_to_unix_sock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_tracing_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_tracing_proto;
+ case BPF_FUNC_sock_from_file:
+ return &bpf_sock_from_file_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
#endif
+ case BPF_FUNC_seq_printf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_proto :
+ NULL;
+ case BPF_FUNC_seq_write:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_write_proto :
+ NULL;
+ case BPF_FUNC_seq_printf_btf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_btf_proto :
+ NULL;
+ case BPF_FUNC_d_path:
+ return &bpf_d_path_proto;
+ case BPF_FUNC_get_func_arg:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ case BPF_FUNC_get_func_ret:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
+ case BPF_FUNC_get_func_arg_cnt:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
default:
- return raw_tp_prog_func_proto(func_id, prog);
+ fn = raw_tp_prog_func_proto(func_id, prog);
+ if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
+ fn = bpf_iter_get_func_proto(func_id, prog);
+ return fn;
}
}
@@ -1190,13 +1695,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return true;
+ return bpf_tracing_ctx_access(off, size, type);
}
static bool tracing_prog_is_valid_access(int off, int size,
@@ -1204,13 +1703,14 @@ static bool tracing_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return btf_ctx_access(off, size, type, prog, info);
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ return -ENOTSUPP;
}
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
@@ -1219,6 +1719,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
};
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_prog_test_run_raw_tp,
+#endif
};
const struct bpf_verifier_ops tracing_verifier_ops = {
@@ -1227,6 +1730,7 @@ const struct bpf_verifier_ops tracing_verifier_ops = {
};
const struct bpf_prog_ops tracing_prog_ops = {
+ .test_run = bpf_prog_test_run_tracing,
};
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
@@ -1338,7 +1842,8 @@ static DEFINE_MUTEX(bpf_event_mutex);
#define BPF_TRACE_MAX_PROGS 64
int perf_event_attach_bpf_prog(struct perf_event *event,
- struct bpf_prog *prog)
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
@@ -1365,12 +1870,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
goto unlock;
}
- ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
if (ret < 0)
goto unlock;
/* set the new array to event->tp_event and set event->prog */
event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
@@ -1391,7 +1897,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
- ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
if (ret == -ENOENT)
goto unlock;
if (ret < 0) {
@@ -1416,7 +1922,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
u32 *ids, prog_cnt, ids_len;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!perfmon_capable())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
@@ -1466,19 +1972,20 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
- struct module *mod = __module_address((unsigned long)btp);
+ struct module *mod;
- if (mod)
- module_put(mod);
+ preempt_disable();
+ mod = __module_address((unsigned long)btp);
+ module_put(mod);
+ preempt_enable();
}
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
+ cant_sleep();
rcu_read_lock();
- preempt_disable();
- (void) BPF_PROG_RUN(prog, args);
- preempt_enable();
+ (void) bpf_prog_run(prog, args);
rcu_read_unlock();
}
@@ -1541,7 +2048,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
+ prog);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
@@ -1621,10 +2129,11 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
{
struct bpf_trace_module *btm, *tmp;
struct module *mod = module;
+ int ret = 0;
if (mod->num_bpf_raw_events == 0 ||
(op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
- return 0;
+ goto out;
mutex_lock(&bpf_module_mutex);
@@ -1634,6 +2143,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
if (btm) {
btm->module = module;
list_add(&btm->list, &bpf_trace_modules);
+ } else {
+ ret = -ENOMEM;
}
break;
case MODULE_STATE_GOING:
@@ -1649,7 +2160,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
mutex_unlock(&bpf_module_mutex);
- return 0;
+out:
+ return notifier_from_errno(ret);
}
static struct notifier_block bpf_module_nb = {
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
new file mode 100644
index 000000000000..9acbc11ac7bb
--- /dev/null
+++ b/kernel/trace/bpf_trace.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf_trace
+
+#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_BPF_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(bpf_trace_printk,
+
+ TP_PROTO(const char *bpf_string),
+
+ TP_ARGS(bpf_string),
+
+ TP_STRUCT__entry(
+ __string(bpf_string, bpf_string)
+ ),
+
+ TP_fast_assign(
+ __assign_str(bpf_string, bpf_string);
+ ),
+
+ TP_printk("%s", __get_str(bpf_string))
+);
+
+#endif /* _TRACE_BPF_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE bpf_trace
+
+#include <trace/define_trace.h>
diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c
new file mode 100644
index 000000000000..f89792c25b11
--- /dev/null
+++ b/kernel/trace/error_report-traces.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Error reporting trace points.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/error_report.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 1af321dec0f1..22061d38fc00 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void)
}
/**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
+ * ftrace_graph_stop - set to permanently disable function graph tracing
*
* In case of an error int function graph tracing, this is called
* to try to keep function graph tracing from causing any more harm.
@@ -115,15 +115,17 @@ int function_graph_enter(unsigned long ret, unsigned long func,
{
struct ftrace_graph_ent trace;
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
/*
* Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses is unpredicatable anymore.
+ * since call sequence and return addresses are unpredictable anyway.
* Ex: BPF trampoline may call original function and may skip frame
* depending on type of BPF programs attached.
*/
if (ftrace_direct_func_count &&
ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
return -EBUSY;
+#endif
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -333,11 +335,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .func = ftrace_graph_func,
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID |
- FTRACE_OPS_FL_STUB,
+ FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
@@ -387,15 +388,14 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
}
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
if (start == end) {
ret = -EAGAIN;
goto unlock;
}
if (t->ret_stack == NULL) {
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->curr_ret_stack = -1;
t->curr_ret_depth = -1;
@@ -403,10 +403,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
- } while_each_thread(g, t);
+ }
unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
free:
for (i = start; i < end; i++)
kfree(ret_stack_list[i]);
@@ -490,7 +490,6 @@ static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
static void
graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
{
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
/* make curr_ret_stack visible before we add the ret_stack */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fd81c7de77a7..be5f6b32a012 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,7 +80,7 @@ enum {
struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ .flags = FTRACE_OPS_FL_STUB,
INIT_OPS_HASH(ftrace_list_end)
};
@@ -102,7 +102,7 @@ static bool ftrace_pids_enabled(struct ftrace_ops *ops)
tr = ops->private;
- return tr->function_pids != NULL;
+ return tr->function_pids != NULL || tr->function_no_pids != NULL;
}
static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -119,14 +119,9 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
-#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
-#else
-/* See comment below, where ftrace_ops_list_func is defined */
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
-#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
-#endif
+/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */
+void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
@@ -140,25 +135,21 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
+ int pid;
- if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid))
- return;
-
- op->saved_func(ip, parent_ip, op, regs);
-}
+ if (tr) {
+ pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ if (pid == FTRACE_PID_IGNORE)
+ return;
+ if (pid != FTRACE_PID_TRACE &&
+ pid != current->pid)
+ return;
+ }
-static void ftrace_sync(struct work_struct *work)
-{
- /*
- * This function is just a stub to implement a hard force
- * of synchronize_rcu(). This requires synchronizing
- * tasks even in userspace and idle.
- *
- * Yes, function tracing is rude.
- */
+ op->saved_func(ip, parent_ip, op, fregs);
}
static void ftrace_sync_ipi(void *data)
@@ -234,7 +225,7 @@ static void update_ftrace_function(void)
/*
* For static tracing, we need to be a bit more careful.
* The function change takes affect immediately. Thus,
- * we need to coorditate the setting of the function_trace_ops
+ * we need to coordinate the setting of the function_trace_ops
* with the setting of the ftrace_trace_function.
*
* Set the function to the list ops, which will call the
@@ -246,7 +237,7 @@ static void update_ftrace_function(void)
* Make sure all CPUs see this. Yes this is slow, but static
* tracing is slow and nasty to have enabled.
*/
- schedule_on_each_cpu(ftrace_sync);
+ synchronize_rcu_tasks_rude();
/* Now all cpus are using the list ops. */
function_trace_op = set_function_trace_op;
/* Make sure the function_trace_op is visible on all CPUs */
@@ -327,7 +318,7 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
return -EBUSY;
- if (!core_kernel_data((unsigned long)ops))
+ if (!is_kernel_core_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
add_ftrace_ops(&ftrace_ops_list, ops);
@@ -585,7 +576,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
}
-int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
{
struct ftrace_profile_page *pg;
int functions;
@@ -758,7 +749,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
static void
function_profile_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
@@ -870,7 +861,7 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
INIT_OPS_HASH(ftrace_profile_ops)
};
@@ -992,8 +983,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
}
- entry = tracefs_create_file("function_profile_enabled", 0644,
- d_tracer, NULL, &ftrace_profile_fops);
+ entry = tracefs_create_file("function_profile_enabled",
+ TRACE_MODE_WRITE, d_tracer, NULL,
+ &ftrace_profile_fops);
if (!entry)
pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
}
@@ -1044,13 +1036,12 @@ struct ftrace_ops global_ops = {
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
/*
- * Used by the stack undwinder to know about dynamic ftrace trampolines.
+ * Used by the stack unwinder to know about dynamic ftrace trampolines.
*/
struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
@@ -1095,7 +1086,7 @@ struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
int index;
- int size;
+ int order;
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
@@ -1372,10 +1363,10 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
int i;
/*
- * Make the hash size about 1/2 the # found
+ * Use around half the size (max bit of it), but
+ * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits).
*/
- for (size /= 2; size; size >>= 1)
- bits++;
+ bits = fls(size / 2);
/* Don't allocate too much */
if (bits > FTRACE_HASH_MAX_BITS)
@@ -1455,7 +1446,7 @@ static bool hash_contains_ip(unsigned long ip,
{
/*
* The function record is a match if it exists in the filter
- * hash and not in the notrace hash. Note, an emty hash is
+ * hash and not in the notrace hash. Note, an empty hash is
* considered a match for the filter hash, but an empty
* notrace hash is considered not in the notrace hash.
*/
@@ -1633,6 +1624,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude);
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
@@ -1782,7 +1775,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* to it.
*/
if (ftrace_rec_count(rec) == 1 &&
- ftrace_find_tramp_ops_any(rec))
+ ftrace_find_tramp_ops_any_other(rec, ops))
rec->flags |= FTRACE_FL_TRAMP;
else
rec->flags &= ~FTRACE_FL_TRAMP;
@@ -1970,12 +1963,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
+ char ins[MCOUNT_INSN_SIZE];
int i;
+ if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
+ printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
+ return;
+ }
+
printk(KERN_CONT "%s", fmt);
for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+ printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
}
enum ftrace_bug_type ftrace_bug_type;
@@ -2017,16 +2016,16 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
unsigned long ip = rec ? rec->ip : 0;
+ pr_info("------------[ ftrace bug ]------------\n");
+
switch (failed) {
case -EFAULT:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on modifying ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
case -EINVAL:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
if (ftrace_expected) {
@@ -2035,14 +2034,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
}
break;
case -EPERM:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on writing ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
default:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on unknown error ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
}
print_bug_type();
if (rec) {
@@ -2067,6 +2064,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
ip = ftrace_get_addr_curr(rec);
pr_cont("\n expected tramp: %lx\n", ip);
}
+
+ FTRACE_WARN_ON_ONCE(1);
}
static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
@@ -2148,6 +2147,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+
if (flag & FTRACE_FL_DIRECT) {
/*
* If there's only one user (direct_ops helper)
@@ -2204,7 +2204,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
}
/**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
* @rec: the record to update
* @enable: set to true if the record is tracing, false to force disable
*
@@ -2217,7 +2217,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
}
/**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
* @rec: the record to test
* @enable: set to true to check if enabled, false if it is disabled
*
@@ -2249,6 +2249,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (op == op_exclude || !op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
struct ftrace_ops *op)
{
@@ -2261,7 +2279,7 @@ ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
if (hash_contains_ip(ip, op->func_hash))
return op;
- }
+ }
return NULL;
}
@@ -2372,9 +2390,43 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
+static struct ftrace_func_entry*
+ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
+ struct ftrace_hash **free_hash)
+{
+ struct ftrace_func_entry *entry;
+
+ if (ftrace_hash_empty(direct_functions) ||
+ direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+ struct ftrace_hash *new_hash;
+ int size = ftrace_hash_empty(direct_functions) ? 0 :
+ direct_functions->count + 1;
+
+ if (size < 32)
+ size = 32;
+
+ new_hash = dup_hash(direct_functions, size);
+ if (!new_hash)
+ return NULL;
+
+ *free_hash = direct_functions;
+ direct_functions = new_hash;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+
+ entry->ip = ip;
+ entry->direct = addr;
+ __add_hash_entry(direct_functions, entry);
+ return entry;
+}
+
static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
unsigned long addr;
addr = ftrace_find_rec_direct(ip);
@@ -2386,9 +2438,17 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops direct_ops = {
.func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
+ .flags = FTRACE_OPS_FL_IPMODIFY
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
+ /*
+ * By declaring the main trampoline as this trampoline
+ * it will never have one allocated for it. Allocated
+ * trampolines should not call direct functions.
+ * The direct_ops should only be called by the builtin
+ * ftrace_regs_caller trampoline.
+ */
+ .trampoline = FTRACE_REGS_ADDR,
};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
@@ -2398,7 +2458,7 @@ struct ftrace_ops direct_ops = {
*
* If the record has the FTRACE_FL_REGS set, that means that it
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
- * is not not set, then it wants to convert to the normal callback.
+ * is not set, then it wants to convert to the normal callback.
*
* Returns the address of the trampoline to set to
*/
@@ -2543,7 +2603,7 @@ struct ftrace_rec_iter {
};
/**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
*
* Returns an iterator handle that is used to iterate over all
* the records that represent address locations where functions
@@ -2574,7 +2634,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
}
/**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
* Returns the next iterator after the given iterator @iter.
@@ -2599,7 +2659,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
}
/**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
* Returns the record that the current @iter is at.
@@ -2702,7 +2762,7 @@ static int __ftrace_modify_code(void *data)
}
/**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
* @command: The command to tell ftrace what to do
*
* If an arch needs to fall back to the stop machine method, the
@@ -2714,7 +2774,7 @@ void ftrace_run_stop_machine(int command)
}
/**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
* @command: The command that needs to be done
*
* Archs can override this function if it does not need to
@@ -2765,6 +2825,51 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_del_rcu(&ops->list);
+ synchronize_rcu();
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+ ops->trampoline) {
+ /*
+ * Record the text poke event before the ksymbol unregister
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline,
+ (void *)ops->trampoline,
+ ops->trampoline_size, NULL, 0);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size,
+ true, FTRACE_TRAMPOLINE_SYM);
+ /* Remove from kallsyms after the perf events */
+ ftrace_remove_trampoline_from_kallsyms(ops);
+ }
+
+ arch_ftrace_trampoline_free(ops);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2814,6 +2919,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
__unregister_ftrace_function(ops);
ftrace_start_up--;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ ftrace_trampoline_free(ops);
return ret;
}
@@ -2922,20 +3029,20 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- schedule_on_each_cpu(ftrace_sync);
+ synchronize_rcu_tasks_rude();
/*
- * When the kernel is preeptive, tasks can be preempted
+ * When the kernel is preemptive, tasks can be preempted
* while on a ftrace trampoline. Just scheduling a task on
* a CPU is not good enough to flush them. Calling
- * synchornize_rcu_tasks() will wait for those tasks to
+ * synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
if (IS_ENABLED(CONFIG_PREEMPTION))
synchronize_rcu_tasks();
free_ops:
- arch_ftrace_trampoline_free(ops);
+ ftrace_trampoline_free(ops);
}
return 0;
@@ -3022,6 +3129,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
+ bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
u64 start, stop;
@@ -3060,8 +3168,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_nop_initialize(mod, p))
+ if (init_nop && !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
@@ -3078,19 +3185,15 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
static int ftrace_allocate_records(struct ftrace_page *pg, int count)
{
int order;
+ int pages;
int cnt;
if (WARN_ON(!count))
return -EINVAL;
- order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
-
- /*
- * We want to fill as much as possible. No more than a page
- * may be empty.
- */
- while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
- order--;
+ /* We want to fill as much as possible, with no empty pages */
+ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+ order = fls(pages) - 1;
again:
pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
@@ -3107,7 +3210,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
ftrace_number_of_groups++;
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
- pg->size = cnt;
+ pg->order = order;
if (cnt > count)
cnt = count;
@@ -3120,7 +3223,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
- int order;
int cnt;
if (!num_to_init)
@@ -3156,12 +3258,13 @@ ftrace_allocate_pages(unsigned long num_to_init)
free_pages:
pg = start_pg;
while (pg) {
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
start_pg = pg->next;
kfree(pg);
pg = start_pg;
- ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
}
pr_info("ftrace: FAILED to allocate memory for functions\n");
@@ -3600,7 +3703,7 @@ static int t_show(struct seq_file *m, void *v)
if (direct)
seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
}
- }
+ }
seq_putc(m, '\n');
@@ -4110,7 +4213,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
struct ftrace_hash **orig_hash, *new_hash;
LIST_HEAD(process_mods);
char *func;
- int ret;
mutex_lock(&ops->func_hash->regex_lock);
@@ -4139,8 +4241,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
if (!func) /* warn? */
continue;
- list_del(&ftrace_mod->list);
- list_add(&ftrace_mod->list, &process_mods);
+ list_move(&ftrace_mod->list, &process_mods);
/* Use the newly allocated func, as it may be "*" */
kfree(ftrace_mod->func);
@@ -4163,7 +4264,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ ftrace_hash_move_and_update_ops(ops, orig_hash,
new_hash, enable);
mutex_unlock(&ftrace_lock);
@@ -4241,7 +4342,7 @@ static int __init ftrace_mod_cmd_init(void)
core_initcall(ftrace_mod_cmd_init);
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct ftrace_probe_ops *probe_ops;
struct ftrace_func_probe *probe;
@@ -4317,7 +4418,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to map @data to
* @data: The data to map to @ip
*
- * Returns 0 on succes otherwise an error.
+ * Returns 0 on success otherwise an error.
*/
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
unsigned long ip, void *data)
@@ -4485,7 +4586,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
/*
* Note, there's a small window here that the func_hash->filter_hash
- * may be NULL or empty. Need to be carefule when reading the loop.
+ * may be NULL or empty. Need to be careful when reading the loop.
*/
mutex_lock(&probe->ops.func_hash->regex_lock);
@@ -4972,6 +5073,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
return NULL;
}
+static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+
+ direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ if (!direct)
+ return NULL;
+ direct->addr = addr;
+ direct->count = 0;
+ list_add_rcu(&direct->next, &ftrace_direct_funcs);
+ ftrace_direct_func_count++;
+ return direct;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5024,43 +5139,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
}
ret = -ENOMEM;
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- goto out_unlock;
-
- free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto out_unlock;
-
direct = ftrace_find_direct_func(addr);
if (!direct) {
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
- if (!direct) {
- kfree(entry);
+ direct = ftrace_alloc_direct_func(addr);
+ if (!direct)
goto out_unlock;
- }
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
}
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
+ entry = ftrace_add_rec_direct(ip, addr, &free_hash);
+ if (!entry)
+ goto out_unlock;
ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
if (ret)
@@ -5129,6 +5217,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
{
struct ftrace_direct_func *direct;
struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
int ret = -ENODEV;
mutex_lock(&direct_mutex);
@@ -5137,7 +5226,8 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
if (!entry)
goto out_unlock;
- if (direct_functions->count == 1)
+ hash = direct_ops.func_hash->filter_hash;
+ if (hash->count == 1)
unregister_ftrace_function(&direct_ops);
ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
@@ -5155,6 +5245,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
list_del_rcu(&direct->next);
synchronize_rcu_tasks();
kfree(direct);
+ kfree(entry);
ftrace_direct_func_count--;
}
}
@@ -5255,6 +5346,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
int modify_ftrace_direct(unsigned long ip,
unsigned long old_addr, unsigned long new_addr)
{
+ struct ftrace_direct_func *direct, *new_direct = NULL;
struct ftrace_func_entry *entry;
struct dyn_ftrace *rec;
int ret = -ENODEV;
@@ -5270,6 +5362,20 @@ int modify_ftrace_direct(unsigned long ip,
if (entry->direct != old_addr)
goto out_unlock;
+ direct = ftrace_find_direct_func(old_addr);
+ if (WARN_ON(!direct))
+ goto out_unlock;
+ if (direct->count > 1) {
+ ret = -ENOMEM;
+ new_direct = ftrace_alloc_direct_func(new_addr);
+ if (!new_direct)
+ goto out_unlock;
+ direct->count--;
+ new_direct->count++;
+ } else {
+ direct->addr = new_addr;
+ }
+
/*
* If there's no other ftrace callback on the rec->ip location,
* then it can be changed directly by the architecture.
@@ -5283,12 +5389,235 @@ int modify_ftrace_direct(unsigned long ip,
ret = 0;
}
+ if (unlikely(ret && new_direct)) {
+ direct->count++;
+ list_del_rcu(&new_direct->next);
+ synchronize_rcu_tasks();
+ kfree(new_direct);
+ ftrace_direct_func_count--;
+ }
+
out_unlock:
mutex_unlock(&ftrace_lock);
mutex_unlock(&direct_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \
+ FTRACE_OPS_FL_SAVE_REGS)
+
+static int check_direct_multi(struct ftrace_ops *ops)
+{
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS)
+ return -EINVAL;
+ return 0;
+}
+
+static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr)
+{
+ struct ftrace_func_entry *entry, *del;
+ int size, i;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (del && del->direct == addr) {
+ remove_hash_entry(direct_functions, del);
+ kfree(del);
+ }
+ }
+ }
+}
+
+/**
+ * register_ftrace_direct_multi - Call a custom trampoline directly
+ * for multiple functions registered in @ops
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the trampoline to call at @ops functions
+ *
+ * This is used to connect a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * The location that it calls (@addr) must be able to handle a direct call,
+ * and save the parameters of the function being traced, and restore them
+ * (or inject new ones if needed), before returning.
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was already registered with this call or
+ * when there are no functions in @ops object.
+ * -EBUSY - Another direct function is already attached (there can be only one)
+ * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ * -ENOMEM - There was an allocation failure.
+ */
+int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash, *free_hash = NULL;
+ struct ftrace_func_entry *entry, *new;
+ int err = -EBUSY, size, i;
+
+ if (ops->func || ops->trampoline)
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ return -EINVAL;
+
+ hash = ops->func_hash->filter_hash;
+ if (ftrace_hash_empty(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are not already registered.. */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (ftrace_find_rec_direct(entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ /* ... and insert them to direct_functions hash. */
+ err = -ENOMEM;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+ if (!new)
+ goto out_remove;
+ entry->direct = addr;
+ }
+ }
+
+ ops->func = call_direct_funcs;
+ ops->flags = MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+
+ err = register_ftrace_function(ops);
+
+ out_remove:
+ if (err)
+ remove_direct_functions_hash(hash, addr);
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (free_hash) {
+ synchronize_rcu_tasks();
+ free_ftrace_hash(free_hash);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+
+/**
+ * unregister_ftrace_direct_multi - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct_multi for @ops object.
+ * @ops: The address of the struct ftrace_ops object
+ *
+ * This is used to remove a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+ err = unregister_ftrace_function(ops);
+ remove_direct_functions_hash(hash, addr);
+ mutex_unlock(&direct_mutex);
+
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+
+/**
+ * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry, *iter;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ int i, size;
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+
+ err = register_ftrace_function(&tmp_ops);
+ if (err)
+ goto out_direct;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
+ entry = __ftrace_lookup_ip(direct_functions, iter->ip);
+ if (!entry)
+ continue;
+ entry->direct = addr;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+ out_direct:
+ mutex_unlock(&direct_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -5299,7 +5628,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
* @reset - non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
- * If @ip is NULL, it failes to update filter.
+ * If @ip is NULL, it fails to update filter.
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
@@ -5514,7 +5843,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
struct ftrace_hash **orig_hash;
struct trace_parser *parser;
int filter_hash;
- int ret;
if (file->f_mode & FMODE_READ) {
iter = m->private;
@@ -5524,7 +5852,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
- ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+ ftrace_process_regex(iter, parser->buffer,
+ parser->idx, enable);
}
trace_parser_put(parser);
@@ -5542,7 +5873,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
- ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
+ ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
} else {
@@ -5877,7 +6208,8 @@ ftrace_graph_release(struct inode *inode, struct file *file)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- schedule_on_each_cpu(ftrace_sync);
+ if (old_hash != EMPTY_HASH)
+ synchronize_rcu_tasks_rude();
free_ftrace_hash(old_hash);
}
@@ -6000,10 +6332,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent)
{
- trace_create_file("set_ftrace_filter", 0644, parent,
+ trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent,
ops, &ftrace_filter_fops);
- trace_create_file("set_ftrace_notrace", 0644, parent,
+ trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent,
ops, &ftrace_notrace_fops);
}
@@ -6030,19 +6362,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
- trace_create_file("available_filter_functions", 0444,
+ trace_create_file("available_filter_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_avail_fops);
- trace_create_file("enabled_functions", 0444,
+ trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- trace_create_file("set_graph_function", 0644, d_tracer,
+ trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0644, d_tracer,
+ trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -6114,6 +6446,7 @@ static int ftrace_process_locs(struct module *mod,
p = start;
pg = start_pg;
while (p < end) {
+ unsigned long end_offset;
addr = ftrace_call_adjust(*p++);
/*
* Some architecture linkers will pad between
@@ -6124,7 +6457,8 @@ static int ftrace_process_locs(struct module *mod,
if (!addr)
continue;
- if (pg->index == pg->size) {
+ end_offset = (pg->index+1) * sizeof(pg->records[0]);
+ if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
@@ -6178,6 +6512,27 @@ struct ftrace_mod_map {
unsigned int num_funcs;
};
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+ unsigned long *value, char *type,
+ char *name, char *module_name,
+ int *exported)
+{
+ struct ftrace_ops *op;
+
+ list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+ if (!op->trampoline || symnum--)
+ continue;
+ *value = op->trampoline;
+ *type = 't';
+ strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ *exported = 0;
+ return 0;
+ }
+
+ return -ERANGE;
+}
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6190,8 +6545,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ continue;
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ if (cnt == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
+ }
}
return cnt;
@@ -6220,7 +6586,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
}
}
-/* Clear any records from hashs */
+/* Clear any records from hashes */
static void clear_mod_from_hashes(struct ftrace_page *pg)
{
struct trace_array *tr;
@@ -6261,7 +6627,6 @@ void ftrace_release_mod(struct module *mod)
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
- int order;
mutex_lock(&ftrace_lock);
@@ -6312,11 +6677,12 @@ void ftrace_release_mod(struct module *mod)
/* Needs to be called outside of ftrace_lock */
clear_mod_from_hashes(pg);
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
tmp_page = pg->next;
kfree(pg);
- ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
}
}
@@ -6370,8 +6736,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -6514,6 +6880,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
{
struct ftrace_mod_map *mod_map;
struct ftrace_mod_func *mod_func;
+ int ret;
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6907,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
WARN_ON(1);
break;
}
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
preempt_enable();
- return -ERANGE;
+ return ret;
}
#else
@@ -6553,6 +6922,18 @@ allocate_ftrace_mod_map(struct module *mod,
{
return NULL;
}
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, char *module_name,
+ int *exported)
+{
+ int ret;
+
+ preempt_disable();
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
+ preempt_enable();
+ return ret;
+}
#endif /* CONFIG_MODULES */
struct ftrace_init_func {
@@ -6619,7 +7000,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
struct list_head clear_hash;
- int order;
INIT_LIST_HEAD(&clear_hash);
@@ -6657,9 +7037,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- ftrace_number_of_pages -= 1 << order;
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
@@ -6688,6 +7069,11 @@ void __init ftrace_free_init_mem(void)
ftrace_free_mem(NULL, start, end);
}
+int __init __weak ftrace_dyn_arch_init(void)
+{
+ return 0;
+}
+
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -6733,7 +7119,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
+ unsigned long trampoline = ops->trampoline;
+
arch_ftrace_update_trampoline(ops);
+ if (ops->trampoline && ops->trampoline != trampoline &&
+ (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+ /* Add to kallsyms before the perf events */
+ ftrace_add_trampoline_to_kallsyms(ops);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size, false,
+ FTRACE_TRAMPOLINE_SYM);
+ /*
+ * Record the perf text poke event after the ksymbol register
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+ (void *)ops->trampoline,
+ ops->trampoline_size);
+ }
}
void ftrace_init_trace_array(struct trace_array *tr)
@@ -6746,8 +7149,7 @@ void ftrace_init_trace_array(struct trace_array *tr)
struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED |
+ .flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
@@ -6797,20 +7199,20 @@ void ftrace_reset_array_ops(struct trace_array *tr)
static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ignored, struct pt_regs *regs)
+ struct ftrace_ops *ignored, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
- if (bit < 0)
- return;
-
/*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_rcu().
+ * The ftrace_test_and_set_recursion() will disable preemption,
+ * which is required since some of the ops may be dynamically
+ * allocated, they must be freed after a synchronize_rcu().
*/
- preempt_disable_notrace();
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
+ if (bit < 0)
+ return;
do_for_each_ftrace_op(op, ftrace_ops_list) {
/* Stub functions don't need to be called nor tested */
@@ -6831,11 +7233,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
pr_warn("op=%p %pS\n", op, op);
goto out;
}
- op->func(ip, parent_ip, op, regs);
+ op->func(ip, parent_ip, op, fregs);
}
} while_for_each_ftrace_op(op);
out:
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -6851,21 +7252,23 @@ out:
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORTS_FTRACE_OPS.
+ *
+ * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be
+ * arch_ftrace_ops_list_func.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+ __ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
-NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
-NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
+NOKPROBE_SYMBOL(arch_ftrace_ops_list_func);
/*
* If there's only one function registered but it does not support
@@ -6873,22 +7276,17 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops);
* this function will be called by the mcount trampoline.
*/
static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
int bit;
- if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
- return;
-
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
- preempt_disable_notrace();
-
- op->func(ip, parent_ip, op, regs);
+ if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching())
+ op->func(ip, parent_ip, op, fregs);
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
NOKPROBE_SYMBOL(ftrace_ops_assist_func);
@@ -6907,11 +7305,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the function does not handle recursion, needs to be RCU safe,
- * or does per cpu logic, then we need to call the assist handler.
+ * If the function does not handle recursion or needs to be RCU safe,
+ * then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & FTRACE_OPS_FL_RCU)
+ if (ops->flags & (FTRACE_OPS_FL_RECURSION |
+ FTRACE_OPS_FL_RCU))
return ftrace_ops_assist_func;
return ops->func;
@@ -6923,11 +7321,17 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
+ struct trace_pid_list *no_pid_list;
pid_list = rcu_dereference_sched(tr->function_pids);
+ no_pid_list = rcu_dereference_sched(tr->function_no_pids);
- this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
- trace_ignore_this_task(pid_list, next));
+ if (trace_ignore_this_task(pid_list, no_pid_list, next))
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+ FTRACE_PID_IGNORE);
+ else
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+ next->pid);
}
static void
@@ -6940,6 +7344,9 @@ ftrace_pid_follow_sched_process_fork(void *data,
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, self, task);
+
+ pid_list = rcu_dereference_sched(tr->function_no_pids);
+ trace_filter_add_remove_task(pid_list, self, task);
}
static void
@@ -6950,6 +7357,9 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
pid_list = rcu_dereference_sched(tr->function_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
+
+ pid_list = rcu_dereference_sched(tr->function_no_pids);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
@@ -6957,52 +7367,67 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
-static void clear_ftrace_pids(struct trace_array *tr)
+static void clear_ftrace_pids(struct trace_array *tr, int type)
{
struct trace_pid_list *pid_list;
+ struct trace_pid_list *no_pid_list;
int cpu;
pid_list = rcu_dereference_protected(tr->function_pids,
lockdep_is_held(&ftrace_lock));
- if (!pid_list)
+ no_pid_list = rcu_dereference_protected(tr->function_no_pids,
+ lockdep_is_held(&ftrace_lock));
+
+ /* Make sure there's something to do */
+ if (!pid_type_enabled(type, pid_list, no_pid_list))
return;
- unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ /* See if the pids still need to be checked after this */
+ if (!still_need_pid_events(type, pid_list, no_pid_list)) {
+ unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE;
+ }
- for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false;
+ if (type & TRACE_PIDS)
+ rcu_assign_pointer(tr->function_pids, NULL);
- rcu_assign_pointer(tr->function_pids, NULL);
+ if (type & TRACE_NO_PIDS)
+ rcu_assign_pointer(tr->function_no_pids, NULL);
/* Wait till all users are no longer using pid filtering */
synchronize_rcu();
- trace_free_pid_list(pid_list);
+ if ((type & TRACE_PIDS) && pid_list)
+ trace_pid_list_free(pid_list);
+
+ if ((type & TRACE_NO_PIDS) && no_pid_list)
+ trace_pid_list_free(no_pid_list);
}
void ftrace_clear_pids(struct trace_array *tr)
{
mutex_lock(&ftrace_lock);
- clear_ftrace_pids(tr);
+ clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
mutex_unlock(&ftrace_lock);
}
-static void ftrace_pid_reset(struct trace_array *tr)
+static void ftrace_pid_reset(struct trace_array *tr, int type)
{
mutex_lock(&ftrace_lock);
- clear_ftrace_pids(tr);
+ clear_ftrace_pids(tr, type);
ftrace_update_pid_func();
ftrace_startup_all(0);
@@ -7066,9 +7491,45 @@ static const struct seq_operations ftrace_pid_sops = {
.show = fpid_show,
};
-static int
-ftrace_pid_open(struct inode *inode, struct file *file)
+static void *fnpid_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
+ mutex_lock(&ftrace_lock);
+ rcu_read_lock_sched();
+
+ pid_list = rcu_dereference_sched(tr->function_no_pids);
+
+ if (!pid_list)
+ return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+ return trace_pid_start(pid_list, pos);
+}
+
+static void *fnpid_next(struct seq_file *m, void *v, loff_t *pos)
{
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_no_pids);
+
+ if (v == FTRACE_NO_PIDS) {
+ (*pos)++;
+ return NULL;
+ }
+ return trace_pid_next(pid_list, v, pos);
+}
+
+static const struct seq_operations ftrace_no_pid_sops = {
+ .start = fnpid_start,
+ .next = fnpid_next,
+ .stop = fpid_stop,
+ .show = fpid_show,
+};
+
+static int pid_open(struct inode *inode, struct file *file, int type)
+{
+ const struct seq_operations *seq_ops;
struct trace_array *tr = inode->i_private;
struct seq_file *m;
int ret = 0;
@@ -7079,9 +7540,22 @@ ftrace_pid_open(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_pid_reset(tr);
+ ftrace_pid_reset(tr, type);
+
+ switch (type) {
+ case TRACE_PIDS:
+ seq_ops = &ftrace_pid_sops;
+ break;
+ case TRACE_NO_PIDS:
+ seq_ops = &ftrace_no_pid_sops;
+ break;
+ default:
+ trace_array_put(tr);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
- ret = seq_open(file, &ftrace_pid_sops);
+ ret = seq_open(file, seq_ops);
if (ret < 0) {
trace_array_put(tr);
} else {
@@ -7093,10 +7567,23 @@ ftrace_pid_open(struct inode *inode, struct file *file)
return ret;
}
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+ return pid_open(inode, file, TRACE_PIDS);
+}
+
+static int
+ftrace_no_pid_open(struct inode *inode, struct file *file)
+{
+ return pid_open(inode, file, TRACE_NO_PIDS);
+}
+
static void ignore_task_cpu(void *data)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
+ struct trace_pid_list *no_pid_list;
/*
* This function is called by on_each_cpu() while the
@@ -7104,18 +7591,25 @@ static void ignore_task_cpu(void *data)
*/
pid_list = rcu_dereference_protected(tr->function_pids,
mutex_is_locked(&ftrace_lock));
+ no_pid_list = rcu_dereference_protected(tr->function_no_pids,
+ mutex_is_locked(&ftrace_lock));
- this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
- trace_ignore_this_task(pid_list, current));
+ if (trace_ignore_this_task(pid_list, no_pid_list, current))
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+ FTRACE_PID_IGNORE);
+ else
+ this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid,
+ current->pid);
}
static ssize_t
-ftrace_pid_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, int type)
{
struct seq_file *m = filp->private_data;
struct trace_array *tr = m->private;
- struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *filtered_pids;
+ struct trace_pid_list *other_pids;
struct trace_pid_list *pid_list;
ssize_t ret;
@@ -7124,19 +7618,43 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
mutex_lock(&ftrace_lock);
- filtered_pids = rcu_dereference_protected(tr->function_pids,
+ switch (type) {
+ case TRACE_PIDS:
+ filtered_pids = rcu_dereference_protected(tr->function_pids,
lockdep_is_held(&ftrace_lock));
+ other_pids = rcu_dereference_protected(tr->function_no_pids,
+ lockdep_is_held(&ftrace_lock));
+ break;
+ case TRACE_NO_PIDS:
+ filtered_pids = rcu_dereference_protected(tr->function_no_pids,
+ lockdep_is_held(&ftrace_lock));
+ other_pids = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+ break;
+ default:
+ ret = -EINVAL;
+ WARN_ON_ONCE(1);
+ goto out;
+ }
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
goto out;
- rcu_assign_pointer(tr->function_pids, pid_list);
+ switch (type) {
+ case TRACE_PIDS:
+ rcu_assign_pointer(tr->function_pids, pid_list);
+ break;
+ case TRACE_NO_PIDS:
+ rcu_assign_pointer(tr->function_no_pids, pid_list);
+ break;
+ }
+
if (filtered_pids) {
synchronize_rcu();
- trace_free_pid_list(filtered_pids);
- } else if (pid_list) {
+ trace_pid_list_free(filtered_pids);
+ } else if (pid_list && !other_pids) {
/* Register a probe to set whether to ignore the tracing of a task */
register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
}
@@ -7159,6 +7677,20 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
return ret;
}
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS);
+}
+
+static ssize_t
+ftrace_no_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS);
+}
+
static int
ftrace_pid_release(struct inode *inode, struct file *file)
{
@@ -7177,10 +7709,20 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
+static const struct file_operations ftrace_no_pid_fops = {
+ .open = ftrace_no_pid_open,
+ .write = ftrace_no_pid_write,
+ .read = seq_read,
+ .llseek = tracing_lseek,
+ .release = ftrace_pid_release,
+};
+
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer,
tr, &ftrace_pid_fops);
+ trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE,
+ d_tracer, tr, &ftrace_no_pid_fops);
}
void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
@@ -7208,7 +7750,9 @@ void ftrace_kill(void)
}
/**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
@@ -7228,7 +7772,7 @@ int ftrace_is_dead(void)
*/
int register_ftrace_function(struct ftrace_ops *ops)
{
- int ret = -1;
+ int ret;
ftrace_ops_init(ops);
@@ -7274,8 +7818,7 @@ static bool is_permanent_ops_registered(void)
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = -ENODEV;
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0456e0a3dab1..382775edf690 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -4,28 +4,6 @@
#ifdef CONFIG_FUNCTION_TRACER
-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_check() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_check() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
-#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_check(list); \
- do
-
-/*
- * Optimized for just a single item in the list (as that is the normal case).
- */
-#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_check((op)->next)) && \
- unlikely((op) != &ftrace_list_end))
-
-extern struct ftrace_ops __rcu *ftrace_ops_list;
-extern struct ftrace_ops ftrace_list_end;
extern struct mutex ftrace_lock;
extern struct ftrace_ops global_ops;
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
new file mode 100644
index 000000000000..a2ef1d18126a
--- /dev/null
+++ b/kernel/trace/pid_list.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+#include <linux/spinlock.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include "trace.h"
+
+/* See pid_list.h for details */
+
+static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
+{
+ union lower_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->lower_list)
+ return NULL;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = chunk->next;
+ pid_list->free_lower_chunks--;
+ WARN_ON_ONCE(pid_list->free_lower_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_lower_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->upper_list)
+ return NULL;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = chunk->next;
+ pid_list->free_upper_chunks--;
+ WARN_ON_ONCE(pid_list->free_upper_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_upper_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline void put_lower_chunk(struct trace_pid_list *pid_list,
+ union lower_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+}
+
+static inline void put_upper_chunk(struct trace_pid_list *pid_list,
+ union upper_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+}
+
+static inline bool upper_empty(union upper_chunk *chunk)
+{
+ /*
+ * If chunk->data has no lower chunks, it will be the same
+ * as a zeroed bitmask. Use find_first_bit() to test it
+ * and if it doesn't find any bits set, then the array
+ * is empty.
+ */
+ int bit = find_first_bit((unsigned long *)chunk->data,
+ sizeof(chunk->data) * 8);
+ return bit >= sizeof(chunk->data) * 8;
+}
+
+static inline int pid_split(unsigned int pid, unsigned int *upper1,
+ unsigned int *upper2, unsigned int *lower)
+{
+ /* MAX_PID should cover all pids */
+ BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT);
+
+ /* In case a bad pid is passed in, then fail */
+ if (unlikely(pid >= MAX_PID))
+ return -1;
+
+ *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK;
+ *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK;
+ *lower = pid & LOWER_MASK;
+
+ return 0;
+}
+
+static inline unsigned int pid_join(unsigned int upper1,
+ unsigned int upper2, unsigned int lower)
+{
+ return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) |
+ ((upper2 & UPPER_MASK) << UPPER2_SHIFT) |
+ (lower & LOWER_MASK);
+}
+
+/**
+ * trace_pid_list_is_set - test if the pid is set in the list
+ * @pid_list: The pid list to test
+ * @pid: The pid to to see if set in the list.
+ *
+ * Tests if @pid is is set in the @pid_list. This is usually called
+ * from the scheduler when a task is scheduled. Its pid is checked
+ * if it should be traced or not.
+ *
+ * Return true if the pid is in the list, false otherwise.
+ */
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ bool ret = false;
+
+ if (!pid_list)
+ return false;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return false;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+
+ return ret;
+}
+
+/**
+ * trace_pid_list_set - add a pid to the list
+ * @pid_list: The pid list to add the @pid to.
+ * @pid: The pid to add.
+ *
+ * Adds @pid to @pid_list. This is usually done explicitly by a user
+ * adding a task to be traced, or indirectly by the fork function
+ * when children should be traced and a task's pid is in the list.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ int ret;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk) {
+ upper_chunk = get_upper_chunk(pid_list);
+ if (!upper_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pid_list->upper[upper1] = upper_chunk;
+ }
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk) {
+ lower_chunk = get_lower_chunk(pid_list);
+ if (!lower_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper_chunk->data[upper2] = lower_chunk;
+ }
+ set_bit(lower, lower_chunk->data);
+ ret = 0;
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return ret;
+}
+
+/**
+ * trace_pid_list_clear - remove a pid from the list
+ * @pid_list: The pid list to remove the @pid from.
+ * @pid: The pid to remove.
+ *
+ * Removes @pid from @pid_list. This is usually done explicitly by a user
+ * removing tasks from tracing, or indirectly by the exit function
+ * when a task that is set to be traced exits.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk)
+ goto out;
+
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ goto out;
+
+ clear_bit(lower, lower_chunk->data);
+
+ /* if there's no more bits set, add it to the free list */
+ if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
+ put_lower_chunk(pid_list, lower_chunk);
+ upper_chunk->data[upper2] = NULL;
+ if (upper_empty(upper_chunk)) {
+ put_upper_chunk(pid_list, upper_chunk);
+ pid_list->upper[upper1] = NULL;
+ }
+ }
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return 0;
+}
+
+/**
+ * trace_pid_list_next - return the next pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pid to start from
+ * @next: The pointer to place the pid that is set starting from @pid.
+ *
+ * Looks for the next consecutive pid that is in @pid_list starting
+ * at the pid specified by @pid. If one is set (including @pid), then
+ * that pid is placed into @next.
+ *
+ * Return 0 when a pid is found, -1 if there are no more pids included.
+ */
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
+ upper_chunk = pid_list->upper[upper1];
+
+ if (!upper_chunk)
+ continue;
+
+ for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ continue;
+
+ lower = find_next_bit(lower_chunk->data, LOWER_MAX,
+ lower);
+ if (lower < LOWER_MAX)
+ goto found;
+ }
+ }
+
+ found:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ if (upper1 > UPPER_MASK)
+ return -1;
+
+ *next = pid_join(upper1, upper2, lower);
+ return 0;
+}
+
+/**
+ * trace_pid_list_first - return the first pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pointer to place the pid first found pid that is set.
+ *
+ * Looks for the first pid that is set in @pid_list, and places it
+ * into @pid if found.
+ *
+ * Return 0 when a pid is found, -1 if there are no pids set.
+ */
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid)
+{
+ return trace_pid_list_next(pid_list, 0, pid);
+}
+
+static void pid_list_refill_irq(struct irq_work *iwork)
+{
+ struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list,
+ refill_irqwork);
+ union upper_chunk *upper = NULL;
+ union lower_chunk *lower = NULL;
+ union upper_chunk **upper_next = &upper;
+ union lower_chunk **lower_next = &lower;
+ int upper_count;
+ int lower_count;
+ int ucnt = 0;
+ int lcnt = 0;
+
+ again:
+ raw_spin_lock(&pid_list->lock);
+ upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
+ lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ raw_spin_unlock(&pid_list->lock);
+
+ if (upper_count <= 0 && lower_count <= 0)
+ return;
+
+ while (upper_count-- > 0) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *upper_next = chunk;
+ upper_next = &chunk->next;
+ ucnt++;
+ }
+
+ while (lower_count-- > 0) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *lower_next = chunk;
+ lower_next = &chunk->next;
+ lcnt++;
+ }
+
+ raw_spin_lock(&pid_list->lock);
+ if (upper) {
+ *upper_next = pid_list->upper_list;
+ pid_list->upper_list = upper;
+ pid_list->free_upper_chunks += ucnt;
+ }
+ if (lower) {
+ *lower_next = pid_list->lower_list;
+ pid_list->lower_list = lower;
+ pid_list->free_lower_chunks += lcnt;
+ }
+ raw_spin_unlock(&pid_list->lock);
+
+ /*
+ * On success of allocating all the chunks, both counters
+ * will be less than zero. If they are not, then an allocation
+ * failed, and we should not try again.
+ */
+ if (upper_count >= 0 || lower_count >= 0)
+ return;
+ /*
+ * When the locks were released, free chunks could have
+ * been used and allocation needs to be done again. Might as
+ * well allocate it now.
+ */
+ goto again;
+}
+
+/**
+ * trace_pid_list_alloc - create a new pid_list
+ *
+ * Allocates a new pid_list to store pids into.
+ *
+ * Returns the pid_list on success, NULL otherwise.
+ */
+struct trace_pid_list *trace_pid_list_alloc(void)
+{
+ struct trace_pid_list *pid_list;
+ int i;
+
+ /* According to linux/thread.h, pids can be no bigger that 30 bits */
+ WARN_ON_ONCE(pid_max > (1 << 30));
+
+ pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return NULL;
+
+ init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
+
+ raw_spin_lock_init(&pid_list->lock);
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+ }
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+ }
+
+ return pid_list;
+}
+
+/**
+ * trace_pid_list_free - Frees an allocated pid_list.
+ *
+ * Frees the memory for a pid_list that was allocated.
+ */
+void trace_pid_list_free(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *upper;
+ union lower_chunk *lower;
+ int i, j;
+
+ if (!pid_list)
+ return;
+
+ irq_work_sync(&pid_list->refill_irqwork);
+
+ while (pid_list->lower_list) {
+ union lower_chunk *chunk;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = pid_list->lower_list->next;
+ kfree(chunk);
+ }
+
+ while (pid_list->upper_list) {
+ union upper_chunk *chunk;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = pid_list->upper_list->next;
+ kfree(chunk);
+ }
+
+ for (i = 0; i < UPPER1_SIZE; i++) {
+ upper = pid_list->upper[i];
+ if (upper) {
+ for (j = 0; j < UPPER2_SIZE; j++) {
+ lower = upper->data[j];
+ kfree(lower);
+ }
+ kfree(upper);
+ }
+ }
+ kfree(pid_list);
+}
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
new file mode 100644
index 000000000000..62e73f1ac85f
--- /dev/null
+++ b/kernel/trace/pid_list.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Do not include this file directly. */
+
+#ifndef _TRACE_INTERNAL_PID_LIST_H
+#define _TRACE_INTERNAL_PID_LIST_H
+
+/*
+ * In order to keep track of what pids to trace, a tree is created much
+ * like page tables are used. This creates a sparse bit map, where
+ * the tree is filled in when needed. A PID is at most 30 bits (see
+ * linux/thread.h), and is broken up into 3 sections based on the bit map
+ * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the
+ * "upper2" section and the 14 LSB is the "lower" section.
+ *
+ * A trace_pid_list structure holds the "upper1" section, in an
+ * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where
+ * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk"
+ * structures, where each has an array of size 2K bytes representing a bitmask
+ * of the 14 LSB of the PID (256 * 8 = 2048)
+ *
+ * When a trace_pid_list is allocated, it includes the 256 pointer array
+ * of the upper1 unions. Then a "cache" of upper and lower is allocated
+ * where these will be assigned as needed.
+ *
+ * When a bit is set in the pid_list bitmask, the pid to use has
+ * the 8 MSB masked, and this is used to index the array in the
+ * pid_list to find the next upper union. If the element is NULL,
+ * then one is retrieved from the upper_list cache. If none is
+ * available, then -ENOMEM is returned.
+ *
+ * The next 8 MSB is used to index into the "upper2" section. If this
+ * element is NULL, then it is retrieved from the lower_list cache.
+ * Again, if one is not available -ENOMEM is returned.
+ *
+ * Finally the 14 LSB of the PID is used to set the bit in the 16384
+ * bitmask (made up of 2K bytes).
+ *
+ * When the second upper section or the lower section has their last
+ * bit cleared, they are added back to the free list to be reused
+ * when needed.
+ */
+
+#define UPPER_BITS 8
+#define UPPER_MAX (1 << UPPER_BITS)
+#define UPPER1_SIZE (1 << UPPER_BITS)
+#define UPPER2_SIZE (1 << UPPER_BITS)
+
+#define LOWER_BITS 14
+#define LOWER_MAX (1 << LOWER_BITS)
+#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG)
+
+#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS)
+#define UPPER2_SHIFT LOWER_BITS
+#define LOWER_MASK (LOWER_MAX - 1)
+
+#define UPPER_MASK (UPPER_MAX - 1)
+
+/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */
+#define MAX_PID (1 << 30)
+
+/* Just keep 6 chunks of both upper and lower in the cache on alloc */
+#define CHUNK_ALLOC 6
+
+/* Have 2 chunks free, trigger a refill of the cache */
+#define CHUNK_REALLOC 2
+
+union lower_chunk {
+ union lower_chunk *next;
+ unsigned long data[LOWER_SIZE]; // 2K in size
+};
+
+union upper_chunk {
+ union upper_chunk *next;
+ union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size
+};
+
+struct trace_pid_list {
+ raw_spinlock_t lock;
+ struct irq_work refill_irqwork;
+ union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
+ union upper_chunk *upper_list;
+ union lower_chunk *lower_list;
+ int free_upper_chunks;
+ int free_lower_chunks;
+};
+
+#endif /* _TRACE_INTERNAL_PID_LIST_H */
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 31c0fad4cb9e..8c4ffd076162 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -16,24 +16,32 @@
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/sysfs.h>
+#include <linux/completion.h>
static ulong delay = 100;
static char test_mode[12] = "irq";
static uint burst_size = 1;
+static int cpu_affinity = -1;
module_param_named(delay, delay, ulong, 0444);
module_param_string(test_mode, test_mode, 12, 0444);
module_param_named(burst_size, burst_size, uint, 0444);
+module_param_named(cpu_affinity, cpu_affinity, int, 0444);
MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
+
+static struct completion done;
#define MIN(x, y) ((x) < (y) ? (x) : (y))
static void busy_wait(ulong time)
{
u64 start, end;
+
start = trace_clock_local();
+
do {
end = trace_clock_local();
if (kthread_should_stop())
@@ -44,6 +52,7 @@ static void busy_wait(ulong time)
static __always_inline void irqoff_test(void)
{
unsigned long flags;
+
local_irq_save(flags);
busy_wait(delay);
local_irq_restore(flags);
@@ -110,25 +119,58 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
+ struct cpumask cpu_mask;
+
+ if (cpu_affinity > -1) {
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(cpu_affinity, &cpu_mask);
+ if (set_cpus_allowed_ptr(current, &cpu_mask))
+ pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
+ }
for (i = 0; i < s; i++)
(testfuncs[i])(i);
+
+ complete(&done);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ __set_current_state(TASK_RUNNING);
+
return 0;
}
-static struct task_struct *preemptirq_start_test(void)
+static int preemptirq_run_test(void)
{
+ struct task_struct *task;
char task_name[50];
+ init_completion(&done);
+
snprintf(task_name, sizeof(task_name), "%s_test", test_mode);
- return kthread_run(preemptirq_delay_run, NULL, task_name);
+ task = kthread_run(preemptirq_delay_run, NULL, task_name);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (task) {
+ wait_for_completion(&done);
+ kthread_stop(task);
+ }
+ return 0;
}
static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
- preemptirq_start_test();
+ ssize_t ret;
+
+ ret = preemptirq_run_test();
+ if (ret)
+ return ret;
return count;
}
@@ -148,11 +190,9 @@ static struct kobject *preemptirq_delay_kobj;
static int __init preemptirq_delay_init(void)
{
- struct task_struct *test_task;
int retval;
- test_task = preemptirq_start_test();
- retval = PTR_ERR_OR_ZERO(test_task);
+ retval = preemptirq_run_test();
if (retval != 0)
return retval;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61f0e92ace99..2699e9e562b1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -129,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -193,7 +203,7 @@ rb_event_length(struct ring_buffer_event *event)
case RINGBUF_TYPE_DATA:
return rb_event_data_length(event);
default:
- BUG();
+ WARN_ON_ONCE(1);
}
/* not hit */
return 0;
@@ -249,7 +259,7 @@ rb_event_data(struct ring_buffer_event *event)
{
if (extended_time(event))
event = skip_time_extend(event);
- BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
if (event->type_len)
return (void *)&event->array[0];
@@ -270,21 +280,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
+#define for_each_online_buffer_cpu(buffer, cpu) \
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
-/**
- * ring_buffer_event_time_stamp - return the event's extended timestamp
- * @event: the event to get the timestamp of
- *
- * Returns the extended timestamp associated with a data event.
- * An extended time_stamp is a 64-bit timestamp represented
- * internally in a special way that makes the best use of space
- * contained within a ring buffer event. This function decodes
- * it and maps it to a straight u64 value.
- */
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -413,21 +416,38 @@ struct rb_irq_work {
struct rb_event_info {
u64 ts;
u64 delta;
+ u64 before;
+ u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
/*
+ * Used for the add_timestamp
+ * NONE
+ * EXTEND - wants a time extend
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
+ * FORCE - force a full time stamp.
+ */
+enum {
+ RB_ADD_STAMP_NONE = 0,
+ RB_ADD_STAMP_EXTEND = BIT(1),
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
+ RB_ADD_STAMP_FORCE = BIT(3)
+};
+/*
* Used for which event context the event is in.
- * NMI = 0
- * IRQ = 1
- * SOFTIRQ = 2
- * NORMAL = 3
+ * TRANSITION = 0
+ * NMI = 1
+ * IRQ = 2
+ * SOFTIRQ = 3
+ * NORMAL = 4
*
* See trace_recursive_lock() comment below for more details.
*/
enum {
+ RB_CTX_TRANSITION,
RB_CTX_NMI,
RB_CTX_IRQ,
RB_CTX_SOFTIRQ,
@@ -435,12 +455,37 @@ enum {
RB_CTX_MAX
};
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+ local_t cnt;
+ local_t top;
+ local_t bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+ local64_t time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
+#define MAX_NEST 5
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
struct ring_buffer_per_cpu {
int cpu;
atomic_t record_disabled;
+ atomic_t resize_disabled;
struct trace_buffer *buffer;
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
@@ -469,7 +514,9 @@ struct ring_buffer_per_cpu {
size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
- u64 write_stamp;
+ rb_time_t write_stamp;
+ rb_time_t before_stamp;
+ u64 event_stamp[MAX_NEST];
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -484,7 +531,6 @@ struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
- atomic_t resize_disabled;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -503,12 +549,292 @@ struct trace_buffer {
struct ring_buffer_iter {
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long head;
+ unsigned long next_event;
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
u64 read_stamp;
+ u64 page_stamp;
+ struct ring_buffer_event *event;
+ int missed_events;
};
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 still uses local64_t).
+ *
+ * For the ring buffer, 64 bit required operations for the time is
+ * the following:
+ *
+ * - Only need 59 bits (uses 60 to make it even).
+ * - Reads may fail if it interrupted a modification of the time stamp.
+ * It will succeed if it did not interrupt another write even if
+ * the read itself is interrupted by a write.
+ * It returns whether it was successful or not.
+ *
+ * - Writes always succeed and will overwrite other writes and writes
+ * that were done by events interrupting the current write.
+ *
+ * - A write followed by a read of the same time stamp will always succeed,
+ * but may not contain the same value.
+ *
+ * - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ * Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+ * (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ * top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT 30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+ return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+ u64 val;
+
+ val = top & RB_TIME_VAL_MASK;
+ val <<= RB_TIME_SHIFT;
+ val |= bottom & RB_TIME_VAL_MASK;
+
+ return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+ unsigned long top, bottom;
+ unsigned long c;
+
+ /*
+ * If the read is interrupted by a write, then the cnt will
+ * be different. Loop until both top and bottom have been read
+ * without interruption.
+ */
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(bottom))
+ return false;
+
+ *ret = rb_time_val(top, bottom);
+ return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ unsigned long cnt;
+
+ return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+ return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+ *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+ *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+ val = rb_time_val_cnt(val, cnt);
+ local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ unsigned long cnt, top, bottom;
+
+ rb_time_split(val, &top, &bottom);
+
+ /* Writes always succeed with a valid number even if it gets interrupted. */
+ do {
+ cnt = local_inc_return(&t->cnt);
+ rb_time_val_set(&t->top, top, cnt);
+ rb_time_val_set(&t->bottom, bottom, cnt);
+ } while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+ unsigned long ret;
+
+ ret = local_cmpxchg(l, expect, set);
+ return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ unsigned long cnt, top, bottom;
+ unsigned long cnt2, top2, bottom2;
+ u64 val;
+
+ /* The cmpxchg always fails if it interrupted an update */
+ if (!__rb_time_read(t, &val, &cnt2))
+ return false;
+
+ if (val != expect)
+ return false;
+
+ cnt = local_read(&t->cnt);
+ if ((cnt & 3) != cnt2)
+ return false;
+
+ cnt2 = cnt + 1;
+
+ rb_time_split(val, &top, &bottom);
+ top = rb_time_val_cnt(top, cnt);
+ bottom = rb_time_val_cnt(bottom, cnt);
+
+ rb_time_split(set, &top2, &bottom2);
+ top2 = rb_time_val_cnt(top2, cnt2);
+ bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+ if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+ return false;
+ return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ *ret = local64_read(&t->time);
+ return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ u64 val;
+ val = local64_cmpxchg(&t->time, expect, set);
+ return val == expect;
+}
+#endif
+
+/*
+ * Enable this to make sure that the event passed to
+ * ring_buffer_event_time_stamp() is not committed and also
+ * is on the buffer that it passed in.
+ */
+//#define RB_VERIFY_EVENT
+#ifdef RB_VERIFY_EVENT
+static struct list_head *rb_list_head(struct list_head *list);
+static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+ struct buffer_page *page = cpu_buffer->commit_page;
+ struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page);
+ struct list_head *next;
+ long commit, write;
+ unsigned long addr = (unsigned long)event;
+ bool done = false;
+ int stop = 0;
+
+ /* Make sure the event exists and is not committed yet */
+ do {
+ if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
+ done = true;
+ commit = local_read(&page->page->commit);
+ write = local_read(&page->write);
+ if (addr >= (unsigned long)&page->page->data[commit] &&
+ addr < (unsigned long)&page->page->data[write])
+ return;
+
+ next = rb_list_head(page->list.next);
+ page = list_entry(next, struct buffer_page, list);
+ } while (!done);
+ WARN_ON_ONCE(1);
+}
+#else
+static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+}
+#endif
+
+
+static inline u64 rb_time_stamp(struct trace_buffer *buffer);
+
+/**
+ * ring_buffer_event_time_stamp - return the event's current time stamp
+ * @buffer: The buffer that the event is on
+ * @event: the event to get the time stamp of
+ *
+ * Note, this must be called after @event is reserved, and before it is
+ * committed to the ring buffer. And must be called from the same
+ * context where the event was reserved (normal, softirq, irq, etc).
+ *
+ * Returns the time stamp associated with the current event.
+ * If the event has an extended time stamp, then that is used as
+ * the time stamp to return.
+ * In the highly unlikely case that the event was nested more than
+ * the max nesting, then the write_stamp of the buffer is returned,
+ * otherwise current time is returned, but that really neither of
+ * the last two cases should ever happen.
+ */
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
+ unsigned int nest;
+ u64 ts;
+
+ /* If the event includes an absolute time, then just use that */
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
+ return rb_event_time_stamp(event);
+
+ nest = local_read(&cpu_buffer->committing);
+ verify_event(cpu_buffer, event);
+ if (WARN_ON_ONCE(!nest))
+ goto fail;
+
+ /* Read the current saved nesting level time stamp */
+ if (likely(--nest < MAX_NEST))
+ return cpu_buffer->event_stamp[nest];
+
+ /* Shouldn't happen, warn if it does */
+ WARN_ONCE(1, "nest (%d) greater than max", nest);
+
+ fail:
+ /* Can only fail on 32 bit */
+ if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
+ /* Screw it, just read the current time */
+ ts = rb_time_stamp(cpu_buffer->buffer);
+
+ return ts;
+}
+
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
@@ -565,7 +891,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
- * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -573,7 +899,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
*/
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
int ret = 0;
@@ -742,11 +1068,19 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
+ u64 ts;
+
+ /* Skip retpolines :-( */
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ ts = trace_clock_local();
+ else
+ ts = buffer->clock();
+
/* shift to debug/test normalization and TIME_EXTENTS */
- return buffer->clock() << DEBUG_SHIFT;
+ return ts << DEBUG_SHIFT;
}
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
{
u64 time;
@@ -864,8 +1198,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* its flags will be non zero.
*/
static inline int
-rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *page, struct list_head *list)
+rb_is_head_page(struct buffer_page *page, struct list_head *list)
{
unsigned long val;
@@ -894,8 +1227,7 @@ static bool rb_is_reader_page(struct buffer_page *page)
/*
* rb_set_list_to_head - set a list_head to be pointing to head.
*/
-static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
+static void rb_set_list_to_head(struct list_head *list)
{
unsigned long *ptr;
@@ -918,7 +1250,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Set the previous list pointer to have the HEAD flag.
*/
- rb_set_list_to_head(cpu_buffer, head->list.prev);
+ rb_set_list_to_head(head->list.prev);
}
static void rb_list_head_clear(struct list_head *list)
@@ -993,8 +1325,7 @@ static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
old_flag, RB_PAGE_NORMAL);
}
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
+static inline void rb_inc_page(struct buffer_page **bpage)
{
struct list_head *p = rb_list_head((*bpage)->list.next);
@@ -1026,11 +1357,11 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
for (i = 0; i < 3; i++) {
do {
- if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
return page;
}
- rb_inc_page(cpu_buffer, &page);
+ rb_inc_page(&page);
} while (page != head);
}
@@ -1184,7 +1515,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ long nr_pages, struct list_head *pages)
{
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
@@ -1224,13 +1556,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
struct page *page;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -1262,7 +1596,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(!nr_pages);
- if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
return -ENOMEM;
/*
@@ -1573,7 +1907,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cond_resched();
to_remove_page = tmp_iter_page;
- rb_inc_page(cpu_buffer, &tmp_iter_page);
+ rb_inc_page(&tmp_iter_page);
/* update the counters */
page_entries = rb_page_entries(to_remove_page);
@@ -1716,18 +2050,18 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long nr_pages;
- int cpu, err = 0;
+ int cpu, err;
/*
* Always succeed at resizing a non-existent buffer:
*/
if (!buffer)
- return size;
+ return 0;
/* Make sure the requested buffer exists */
if (cpu_id != RING_BUFFER_ALL_CPUS &&
!cpumask_test_cpu(cpu_id, buffer->cpumask))
- return size;
+ return 0;
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
@@ -1735,20 +2069,24 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
- size = nr_pages * BUF_PAGE_SIZE;
-
- /*
- * Don't succeed if resizing is disabled, as a reader might be
- * manipulating the ring buffer and is expecting a sane state while
- * this is true.
- */
- if (atomic_read(&buffer->resize_disabled))
- return -EBUSY;
-
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
+
if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /*
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (atomic_read(&cpu_buffer->resize_disabled)) {
+ err = -EBUSY;
+ goto out_err_unlock;
+ }
+ }
+
/* calculate the pages to update */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -1765,15 +2103,15 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
* allocated without receiving ENOMEM
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu)) {
+ if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
/* not enough memory for new pages */
err = -ENOMEM;
goto out_err;
}
}
- get_online_cpus();
+ cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -1805,29 +2143,35 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- put_online_cpus();
+ cpus_read_unlock();
} else {
- /* Make sure this CPU has been initialized */
- if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
- goto out;
-
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
goto out;
+ /*
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ if (atomic_read(&cpu_buffer->resize_disabled)) {
+ err = -EBUSY;
+ goto out_err_unlock;
+ }
+
cpu_buffer->nr_pages_to_update = nr_pages -
cpu_buffer->nr_pages;
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
- __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
- &cpu_buffer->new_pages, cpu_id)) {
+ __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages)) {
err = -ENOMEM;
goto out_err;
}
- get_online_cpus();
+ cpus_read_lock();
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
@@ -1839,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- put_online_cpus();
+ cpus_read_unlock();
}
out:
@@ -1867,7 +2211,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
mutex_unlock(&buffer->mutex);
- return size;
+ return 0;
out_err:
for_each_buffer_cpu(buffer, cpu) {
@@ -1885,6 +2229,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
free_buffer_page(bpage);
}
}
+ out_err_unlock:
mutex_unlock(&buffer->mutex);
return err;
}
@@ -1913,15 +2258,63 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read);
}
-static __always_inline struct ring_buffer_event *
-rb_iter_head_event(struct ring_buffer_iter *iter)
+static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
{
- return __rb_page_index(iter->head_page, iter->head);
+ return local_read(&bpage->page->commit);
}
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
+static struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
{
- return local_read(&bpage->page->commit);
+ struct ring_buffer_event *event;
+ struct buffer_page *iter_head_page = iter->head_page;
+ unsigned long commit;
+ unsigned length;
+
+ if (iter->head != iter->next_event)
+ return iter->event;
+
+ /*
+ * When the writer goes across pages, it issues a cmpxchg which
+ * is a mb(), which will synchronize with the rmb here.
+ * (see rb_tail_page_update() and __rb_reserve_next())
+ */
+ commit = rb_page_commit(iter_head_page);
+ smp_rmb();
+ event = __rb_page_index(iter_head_page, iter->head);
+ length = rb_event_length(event);
+
+ /*
+ * READ_ONCE() doesn't work on functions and we don't want the
+ * compiler doing any crazy optimizations with length.
+ */
+ barrier();
+
+ if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+ /* Writer corrupted the read? */
+ goto reset;
+
+ memcpy(iter->event, event, length);
+ /*
+ * If the page stamp is still the same after this rmb() then the
+ * event was safely copied without the writer entering the page.
+ */
+ smp_rmb();
+
+ /* Make sure the page didn't change since we read this */
+ if (iter->page_stamp != iter_head_page->page->time_stamp ||
+ commit > rb_page_commit(iter_head_page))
+ goto reset;
+
+ iter->next_event = iter->head + length;
+ return iter->event;
+ reset:
+ /* Reset to the beginning */
+ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->head = 0;
+ iter->next_event = 0;
+ iter->missed_events = 1;
+ return NULL;
}
/* Size is determined by what has been committed */
@@ -1957,10 +2350,11 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
if (iter->head_page == cpu_buffer->reader_page)
iter->head_page = rb_set_head_page(cpu_buffer);
else
- rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_inc_page(&iter->head_page);
- iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
+ iter->next_event = 0;
}
/*
@@ -2059,7 +2453,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* want the outer most commit to reset it.
*/
new_head = next_page;
- rb_inc_page(cpu_buffer, &new_head);
+ rb_inc_page(&new_head);
ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
RB_PAGE_NORMAL);
@@ -2211,7 +2605,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
next_page = tail_page;
- rb_inc_page(cpu_buffer, &next_page);
+ rb_inc_page(&next_page);
/*
* If for some reason, we had an interrupt storm that made
@@ -2237,7 +2631,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* the buffer, unless the commit page is still on the
* reader page.
*/
- if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+ if (rb_is_head_page(next_page, &tail_page->list)) {
/*
* If the commit is not on the reader page, then
@@ -2268,7 +2662,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* have filled up the buffer with events
* from interrupts and such, and wrapped.
*
- * Note, if the tail page is also the on the
+ * Note, if the tail page is also on the
* reader_page, we let it move out.
*/
if (unlikely((cpu_buffer->commit_page !=
@@ -2302,8 +2696,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
@@ -2324,8 +2718,65 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
return skip_time_extend(event);
}
-static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event);
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ u64 write_stamp;
+
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)info->before,
+ (unsigned long long)info->after,
+ (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event **event,
+ struct rb_event_info *info,
+ u64 *delta,
+ unsigned int *length)
+{
+ bool abs = info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+ if (unlikely(info->delta > (1ULL << 59))) {
+ /* did the clock go backwards */
+ if (info->before == info->after && info->before > info->ts) {
+ /* not interrupted */
+ static int once;
+
+ /*
+ * This is possible with a recalibrating of the TSC.
+ * Do not produce a call stack, but just report it.
+ */
+ if (!once) {
+ once++;
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+ info->before, info->ts);
+ }
+ } else
+ rb_check_timestamp(cpu_buffer, info);
+ if (!abs)
+ info->delta = 0;
+ }
+ *event = rb_add_time_stamp(*event, info->delta, abs);
+ *length -= RB_LEN_TIME_EXTEND;
+ *delta = 0;
+}
/**
* rb_update_event - update event type and data
@@ -2345,26 +2796,21 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned length = info->length;
u64 delta = info->delta;
+ unsigned int nest = local_read(&cpu_buffer->committing) - 1;
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
+ if (!WARN_ON_ONCE(nest >= MAX_NEST))
+ cpu_buffer->event_stamp[nest] = info->ts;
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
- if (unlikely(info->add_timestamp)) {
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
- event = rb_add_time_stamp(event, info->delta, abs);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
+ if (unlikely(info->add_timestamp))
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2379,11 +2825,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
@@ -2403,12 +2849,24 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
+static u64 rb_time_delta(struct ring_buffer_event *event)
{
- return true;
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return 0;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return rb_event_time_stamp(event);
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return 0;
+
+ case RINGBUF_TYPE_DATA:
+ return event->time_delta;
+ default:
+ return 0;
+ }
}
-#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2418,6 +2876,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long index;
unsigned long addr;
+ u64 write_stamp;
+ u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -2426,10 +2886,43 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
+ delta = rb_time_delta(event);
+
+ if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+ return 0;
+
+ /* Make sure the write stamp is read before testing the location */
+ barrier();
+
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
+
+ /* Something came in, can't discard */
+ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ write_stamp, write_stamp - delta))
+ return 0;
+
+ /*
+ * It's possible that the event time delta is zero
+ * (has the same time stamp as the previous event)
+ * in which case write_stamp and before_stamp could
+ * be the same. In such a case, force before_stamp
+ * to be different than write_stamp. It doesn't
+ * matter what it is, as long as its different.
+ */
+ if (!delta)
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+ * write stamp for us, and use the before stamp as its delta.
+ */
+
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2480,11 +2973,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
return;
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- /* Only update the write stamp if the page has an event */
- if (rb_page_write(cpu_buffer->commit_page))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ rb_inc_page(&cpu_buffer->commit_page);
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2556,54 +3045,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp += delta;
- } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp = delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
rb_end_commit(cpu_buffer);
}
@@ -2649,6 +3094,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
irq_work_queue(&cpu_buffer->irq_work.work);
}
+#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
+# define do_ring_buffer_record_recursion() \
+ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
+#else
+# define do_ring_buffer_record_recursion() do { } while (0)
+#endif
+
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -2659,10 +3111,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* a bit of overhead in something as critical as function tracing,
* we use a bitmask trick.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * bit 1 = NMI context
+ * bit 2 = IRQ context
+ * bit 3 = SoftIRQ context
+ * bit 4 = normal context.
*
* This works because this is the order of contexts that can
* preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -2685,23 +3137,52 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The least significant bit can be cleared this way, and it
* just so happens that it is the same bit corresponding to
* the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
- unsigned long pc = preempt_count();
- int bit;
+ int bit = interrupt_context_level();
- if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
- bit = RB_CTX_NORMAL;
- else
- bit = pc & NMI_MASK ? RB_CTX_NMI :
- pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+ bit = RB_CTX_NORMAL - bit;
- if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
- return 1;
+ if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+ /*
+ * It is possible that this was called by transitioning
+ * between interrupt context, and preempt_count() has not
+ * been updated yet. In this case, use the TRANSITION bit.
+ */
+ bit = RB_CTX_TRANSITION;
+ if (val & (1 << (bit + cpu_buffer->nest))) {
+ do_ring_buffer_record_recursion();
+ return 1;
+ }
+ }
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
@@ -2716,8 +3197,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->current_context - (1 << cpu_buffer->nest);
}
-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5
/**
* ring_buffer_nest_start - Allow to trace while nested
@@ -2794,22 +3275,156 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct rb_event_info *info)
+/* Special value to validate all deltas on a page. */
+#define CHECK_FULL_PAGE 1L
+
+#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+static void dump_buffer_page(struct buffer_data_page *bpage,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int e;
+
+ ts = bpage->time_stamp;
+ pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ ts = delta;
+ pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
+static DEFINE_PER_CPU(atomic_t, checking);
+static atomic_t ts_dump;
+
+/*
+ * Check if the current event time stamp matches the deltas on
+ * the buffer page.
+ */
+static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
+{
+ struct ring_buffer_event *event;
+ struct buffer_data_page *bpage;
+ u64 ts, delta;
+ bool full = false;
+ int e;
+
+ bpage = info->tail_page->page;
+
+ if (tail == CHECK_FULL_PAGE) {
+ full = true;
+ tail = local_read(&bpage->commit);
+ } else if (info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
+ /* Ignore events with absolute time stamps */
+ return;
+ }
+
+ /*
+ * Do not check the first event (skip possible extends too).
+ * Also do not check if previous events have not been committed.
+ */
+ if (tail <= 8 || tail > local_read(&bpage->commit))
+ return;
+
+ /*
+ * If this interrupted another event,
+ */
+ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+ goto out;
+
+ ts = bpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(bpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ ts += event->time_delta;
+ break;
+
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ }
+ }
+ if ((full && ts > info->ts) ||
+ (!full && ts + info->delta != info->ts)) {
+ /* If another report is happening, ignore this one */
+ if (atomic_inc_return(&ts_dump) != 1) {
+ atomic_dec(&ts_dump);
+ goto out;
+ }
+ atomic_inc(&cpu_buffer->record_disabled);
+ /* There's some cases in boot up that this can happen */
+ WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
+ pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "");
+ dump_buffer_page(bpage, info, tail);
+ atomic_dec(&ts_dump);
+ /* Do not re-enable checking */
+ return;
+ }
+out:
+ atomic_dec(this_cpu_ptr(&checking));
+}
+#else
+static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info,
+ unsigned long tail)
{
- WARN_ONCE(info->delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)info->delta,
- (unsigned long long)info->ts,
- (unsigned long long)cpu_buffer->write_stamp,
- sched_clock_stable() ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
- "or add trace_clock=global to the kernel command line\n");
- info->add_timestamp = 1;
}
+#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2817,35 +3432,133 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
{
struct ring_buffer_event *event;
struct buffer_page *tail_page;
- unsigned long tail, write;
-
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(info->add_timestamp))
- info->length += RB_LEN_TIME_EXTEND;
+ unsigned long tail, write, w;
+ bool a_ok;
+ bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
- write = local_add_return(info->length, &tail_page->write);
+
+ /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
+ barrier();
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ info->ts = rb_time_stamp(cpu_buffer->buffer);
+
+ if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+ info->delta = info->ts;
+ } else {
+ /*
+ * If interrupting an event time update, we may need an
+ * absolute timestamp.
+ * Don't bother if this is the start of a new page (w == 0).
+ */
+ if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ } else {
+ info->delta = info->ts - info->after;
+ if (unlikely(test_time_stamp(info->delta))) {
+ info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ }
+ }
+ }
+
+ /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
+
+ /*C*/ write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
write &= RB_WRITE_MASK;
+
tail = write - info->length;
+ /* See if we shot pass the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE)) {
+ /* before and after may now different, fix it up*/
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ if (a_ok && b_ok && info->before != info->after)
+ (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+ info->before, info->after);
+ if (a_ok && b_ok)
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+ return rb_move_tail(cpu_buffer, tail, info);
+ }
+
+ if (likely(tail == w)) {
+ u64 save_before;
+ bool s_ok;
+
+ /* Nothing interrupted us between A and C */
+ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
+ barrier();
+ /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+ RB_WARN_ON(cpu_buffer, !s_ok);
+ if (likely(!(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+ /* This did not interrupt any time update */
+ info->delta = info->ts - info->after;
+ else
+ /* Just use full timestamp for interrupting event */
+ info->delta = info->ts;
+ barrier();
+ check_buffer(cpu_buffer, info, tail);
+ if (unlikely(info->ts != save_before)) {
+ /* SLOW PATH - Interrupted between C and E */
+
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /* Write stamp must only go forward */
+ if (save_before > info->after) {
+ /*
+ * We do not care about the result, only that
+ * it gets updated atomically.
+ */
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, save_before);
+ }
+ }
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ barrier();
+ /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after < ts &&
+ rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, ts)) {
+ /* Nothing came after this event between C and E */
+ info->delta = ts - info->after;
+ } else {
+ /*
+ * Interrupted between C and E:
+ * Lost the previous events time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
+ * came after this will still be correct (as they would
+ * have built their delta on the previous event.
+ */
+ info->delta = 0;
+ }
+ info->ts = ts;
+ info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+ }
+
/*
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+ if (unlikely(!tail && !(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
info->delta = 0;
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, tail, info);
-
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
@@ -2857,7 +3570,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then update
* its timestamp.
*/
- if (!tail)
+ if (unlikely(!tail))
tail_page->page->time_stamp = info->ts;
/* account for these added bytes */
@@ -2874,9 +3587,10 @@ rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_event *event;
struct rb_event_info info;
int nr_loops = 0;
- u64 diff;
+ int add_ts_default;
rb_start_commit(cpu_buffer);
+ /* The commit page can not change after this */
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/*
@@ -2894,8 +3608,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
#endif
info.length = rb_calculate_event_length(length);
+
+ if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+ add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+ info.length += RB_LEN_TIME_EXTEND;
+ } else {
+ add_ts_default = RB_ADD_STAMP_NONE;
+ }
+
again:
- info.add_timestamp = 0;
+ info.add_timestamp = add_ts_default;
info.delta = 0;
/*
@@ -2910,35 +3632,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- info.ts = rb_time_stamp(cpu_buffer->buffer);
- diff = info.ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- if (ring_buffer_time_stamp_abs(buffer)) {
- info.delta = info.ts;
- rb_handle_timestamp(cpu_buffer, &info);
- } else /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
- info.delta = diff;
- if (unlikely(test_time_stamp(info.delta)))
- rb_handle_timestamp(cpu_buffer, &info);
- }
-
event = __rb_reserve_next(cpu_buffer, &info);
if (unlikely(PTR_ERR(event) == -EAGAIN)) {
- if (info.add_timestamp)
+ if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
info.length -= RB_LEN_TIME_EXTEND;
goto again;
}
- if (!event)
- goto out_fail;
-
- return event;
-
+ if (likely(event))
+ return event;
out_fail:
rb_end_commit(cpu_buffer);
return NULL;
@@ -3028,14 +3731,14 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* Because the commit page may be on the reader page we
* start with the next page and check the end loop there.
*/
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
start = bpage;
do {
if (bpage->page == (void *)addr) {
local_dec(&bpage->entries);
return;
}
- rb_inc_page(cpu_buffer, &bpage);
+ rb_inc_page(&bpage);
} while (bpage != start);
/* commit not part of this buffer?? */
@@ -3043,7 +3746,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * ring_buffer_commit_discard - discard an event that has not been committed
+ * ring_buffer_discard_commit - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*
@@ -3084,11 +3787,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
if (rb_try_to_discard(cpu_buffer, event))
goto out;
- /*
- * The commit is still visible by the reader, so we
- * must still update the timestamp.
- */
- rb_update_write_stamp(cpu_buffer, event);
out:
rb_end_commit(cpu_buffer);
@@ -3177,10 +3875,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
if (unlikely(!head))
return true;
- return reader->read == rb_page_commit(reader) &&
- (commit == reader ||
- (commit == head &&
- head->read == rb_page_commit(commit)));
+ /* Reader should exhaust content in reader page */
+ if (reader->read != rb_page_commit(reader))
+ return false;
+
+ /*
+ * If writers are committing on the reader page, knowing all
+ * committed content has been read, the ring buffer is empty.
+ */
+ if (commit == reader)
+ return true;
+
+ /*
+ * If writers are committing on a page other than reader page
+ * and head page, there should always be content to read.
+ */
+ if (commit != head)
+ return false;
+
+ /*
+ * Writers are committing on the head page, we just need
+ * to care about there're committed data, and the reader will
+ * swap reader page with head page when it is to read data.
+ */
+ return rb_page_commit(commit) == 0;
}
/**
@@ -3547,14 +4265,18 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
+ iter->next_event = iter->head;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
- if (iter->head)
+ if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
- else
+ iter->page_stamp = cpu_buffer->reader_page->page->time_stamp;
+ } else {
iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->page_stamp = iter->read_stamp;
+ }
}
/**
@@ -3590,17 +4312,38 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
struct buffer_page *reader;
struct buffer_page *head_page;
struct buffer_page *commit_page;
+ struct buffer_page *curr_commit_page;
unsigned commit;
+ u64 curr_commit_ts;
+ u64 commit_ts;
cpu_buffer = iter->cpu_buffer;
-
- /* Remember, trace recording is off when iterator is in use */
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
commit_page = cpu_buffer->commit_page;
+ commit_ts = commit_page->page->time_stamp;
+
+ /*
+ * When the writer goes across pages, it issues a cmpxchg which
+ * is a mb(), which will synchronize with the rmb here.
+ * (see rb_tail_page_update())
+ */
+ smp_rmb();
commit = rb_page_commit(commit_page);
+ /* We want to make sure that the commit page doesn't change */
+ smp_rmb();
+
+ /* Make sure commit page didn't change */
+ curr_commit_page = READ_ONCE(cpu_buffer->commit_page);
+ curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp);
+
+ /* If the commit page changed, then there's more data */
+ if (curr_commit_page != commit_page ||
+ curr_commit_ts != commit_ts)
+ return 0;
- return ((iter->head_page == commit_page && iter->head == commit) ||
+ /* Still racy, as it may return a false positive, but that's OK */
+ return ((iter->head_page == commit_page && iter->head >= commit) ||
(iter->head_page == reader && commit_page == head_page &&
head_page->read == commit &&
iter->head == rb_page_commit(cpu_buffer->reader_page)));
@@ -3618,12 +4361,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
cpu_buffer->read_stamp = delta;
return;
@@ -3632,7 +4375,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
default:
- BUG();
+ RB_WARN_ON(cpu_buffer, 1);
}
return;
}
@@ -3648,12 +4391,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
iter->read_stamp = delta;
return;
@@ -3662,7 +4405,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
default:
- BUG();
+ RB_WARN_ON(iter->cpu_buffer, 1);
}
return;
}
@@ -3737,7 +4480,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->pages = reader->list.prev;
/* The reader page will be pointing to the new head */
- rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+ rb_set_list_to_head(&cpu_buffer->reader_page->list);
/*
* We want to make sure we read the overruns after we set up our
@@ -3776,7 +4519,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_inc_page(&cpu_buffer->head_page);
local_inc(&cpu_buffer->pages_read);
@@ -3828,15 +4571,22 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_advance_iter(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_event *event;
- unsigned length;
cpu_buffer = iter->cpu_buffer;
+ /* If head == next_event then we need to jump to the next event */
+ if (iter->head == iter->next_event) {
+ /* If the event gets overwritten again, there's nothing to do */
+ if (rb_iter_head_event(iter) == NULL)
+ return;
+ }
+
+ iter->head = iter->next_event;
+
/*
* Check if we are at the end of the buffer.
*/
- if (iter->head >= rb_page_size(iter->head_page)) {
+ if (iter->next_event >= rb_page_size(iter->head_page)) {
/* discarded commits can make the page empty */
if (iter->head_page == cpu_buffer->commit_page)
return;
@@ -3844,27 +4594,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
return;
}
- event = rb_iter_head_event(iter);
-
- length = rb_event_length(event);
-
- /*
- * This should not be called to advance the header if we are
- * at the tail of the buffer.
- */
- if (RB_WARN_ON(cpu_buffer,
- (iter->head_page == cpu_buffer->commit_page) &&
- (iter->head + length > rb_commit_index(cpu_buffer))))
- return;
-
- rb_update_iter_read_stamp(iter, event);
-
- iter->head += length;
-
- /* check for end of page padding */
- if ((iter->head >= rb_page_size(iter->head_page)) &&
- (iter->head_page != cpu_buffer->commit_page))
- rb_inc_iter(iter);
+ rb_update_iter_read_stamp(iter, iter->event);
}
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3919,7 +4649,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -3938,7 +4668,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
return event;
default:
- BUG();
+ RB_WARN_ON(cpu_buffer, 1);
}
return NULL;
@@ -3973,14 +4703,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a time extend is encountered or we hit
- * the end of the page. Since the time extend is always attached
- * to a data event, we should never loop more than three times.
- * Once for going to next page, once on time extend, and
- * finally once to get the event.
- * (We never hit the following condition more than thrice).
+ * As the writer can mess with what the iterator is trying
+ * to read, just give up if we fail to get an event after
+ * three tries. The iterator is not as reliable when reading
+ * the ring buffer with an active write as the consumer is.
+ * Do not warn if the three failures is reached.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
+ if (++nr_loops > 3)
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -3992,6 +4721,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
event = rb_iter_head_event(iter);
+ if (!event)
+ goto again;
switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
@@ -4009,7 +4740,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4026,7 +4757,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return event;
default:
- BUG();
+ RB_WARN_ON(cpu_buffer, 1);
}
return NULL;
@@ -4102,6 +4833,20 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
return event;
}
+/** ring_buffer_iter_dropped - report if there are dropped events
+ * @iter: The ring buffer iterator
+ *
+ * Returns true if there was dropped events since the last peek.
+ */
+bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
+{
+ bool ret = iter->missed_events != 0;
+
+ iter->missed_events = 0;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
+
/**
* ring_buffer_iter_peek - peek at the next event to be read
* @iter: The ring buffer iterator
@@ -4208,16 +4953,21 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kmalloc(sizeof(*iter), flags);
+ iter = kzalloc(sizeof(*iter), flags);
if (!iter)
return NULL;
+ iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+ if (!iter->event) {
+ kfree(iter);
+ return NULL;
+ }
+
cpu_buffer = buffer->buffers[cpu];
iter->cpu_buffer = cpu_buffer;
- atomic_inc(&buffer->resize_disabled);
- atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
return iter;
}
@@ -4290,42 +5040,31 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
rb_check_pages(cpu_buffer);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->buffer->resize_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ kfree(iter->event);
kfree(iter);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
/**
- * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * ring_buffer_iter_advance - advance the iterator to the next location
* @iter: The ring buffer iterator
- * @ts: The time stamp of the event read.
*
- * This reads the next event in the ring buffer and increments the iterator.
+ * Move the location of the iterator such that the next read will
+ * be the next location of the iterator.
*/
-struct ring_buffer_event *
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
{
- struct ring_buffer_event *event;
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- again:
- event = rb_iter_peek(iter, ts);
- if (!event)
- goto out;
-
- if (event->type_len == RINGBUF_TYPE_PADDING)
- goto again;
rb_advance_iter(iter);
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- return event;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
-EXPORT_SYMBOL_GPL(ring_buffer_read);
+EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
/**
* ring_buffer_size - return the size of the ring buffer (in bytes)
@@ -4384,8 +5123,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
- cpu_buffer->write_stamp = 0;
- cpu_buffer->read_stamp = 0;
+ rb_time_set(&cpu_buffer->write_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+ memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp));
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
@@ -4393,6 +5134,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_head_page_activate(cpu_buffer);
}
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
/**
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
@@ -4401,35 +5162,62 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
- atomic_inc(&buffer->resize_disabled);
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ atomic_inc(&cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
/* Make sure all commits have finished */
synchronize_rcu();
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reset_disabled_cpu_buffer(cpu_buffer);
- if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
- arch_spin_lock(&cpu_buffer->lock);
+ mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
- rb_reset_cpu(cpu_buffer);
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
- arch_spin_unlock(&cpu_buffer->lock);
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
- atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+ mutex_unlock(&buffer->mutex);
}
-EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset - reset a ring buffer
@@ -4437,10 +5225,32 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
*/
void ring_buffer_reset(struct trace_buffer *buffer)
{
+ struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
- for_each_buffer_cpu(buffer, cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+ mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8df0aa810950..78e576575b79 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
static int producer_nice = MAX_NICE;
static int consumer_nice = MAX_NICE;
-static int producer_fifo = -1;
-static int consumer_fifo = -1;
+static int producer_fifo;
+static int consumer_fifo;
module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
@@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
module_param(producer_fifo, int, 0644);
-MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo");
module_param(consumer_fifo, int, 0644);
-MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo");
static int read_events;
@@ -303,22 +303,22 @@ static void ring_buffer_producer(void)
trace_printk("ERROR!\n");
if (!disable_reader) {
- if (consumer_fifo < 0)
+ if (consumer_fifo)
+ trace_printk("Running Consumer at SCHED_FIFO %s\n",
+ consumer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
- else
- trace_printk("Running Consumer at SCHED_FIFO %d\n",
- consumer_fifo);
}
- if (producer_fifo < 0)
+ if (producer_fifo)
+ trace_printk("Running Producer at SCHED_FIFO %s\n",
+ producer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
- else
- trace_printk("Running Producer at SCHED_FIFO %d\n",
- producer_fifo);
/* Let the user know that the test is running at low priority */
- if (producer_fifo < 0 && consumer_fifo < 0 &&
+ if (!producer_fifo && !consumer_fifo &&
producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
@@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void)
* Run them as low-prio background tasks by default:
*/
if (!disable_reader) {
- if (consumer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = consumer_fifo
- };
- sched_setscheduler(consumer, SCHED_FIFO, &param);
- } else
+ if (consumer_fifo >= 2)
+ sched_set_fifo(consumer);
+ else if (consumer_fifo == 1)
+ sched_set_fifo_low(consumer);
+ else
set_user_nice(consumer, consumer_nice);
}
- if (producer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = producer_fifo
- };
- sched_setscheduler(producer, SCHED_FIFO, &param);
- } else
+ if (producer_fifo >= 2)
+ sched_set_fifo(producer);
+ else if (producer_fifo == 1)
+ sched_set_fifo_low(producer);
+ else
set_user_nice(producer, producer_nice);
return 0;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 7d56d621ffea..0b15e975d2c2 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Test module for in-kernel sythetic event creation and generation.
+ * Test module for in-kernel synthetic event creation and generation.
*
* Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
*/
@@ -242,9 +242,11 @@ static struct synth_field_desc create_synth_test_fields[] = {
{ .type = "pid_t", .name = "next_pid_field" },
{ .type = "char[16]", .name = "next_comm_field" },
{ .type = "u64", .name = "ts_ns" },
+ { .type = "char[]", .name = "dynstring_field_1" },
{ .type = "u64", .name = "ts_ms" },
{ .type = "unsigned int", .name = "cpu" },
{ .type = "char[64]", .name = "my_string_field" },
+ { .type = "char[]", .name = "dynstring_field_2" },
{ .type = "int", .name = "my_int_field" },
};
@@ -254,7 +256,7 @@ static struct synth_field_desc create_synth_test_fields[] = {
*/
static int __init test_create_synth_event(void)
{
- u64 vals[7];
+ u64 vals[9];
int ret;
/* Create the create_synth_test event with the fields above */
@@ -292,10 +294,12 @@ static int __init test_create_synth_event(void)
vals[0] = 777; /* next_pid_field */
vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */
vals[2] = 1000000; /* ts_ns */
- vals[3] = 1000; /* ts_ms */
- vals[4] = raw_smp_processor_id(); /* cpu */
- vals[5] = (u64)(long)"thneed"; /* my_string_field */
- vals[6] = 398; /* my_int_field */
+ vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */
+ vals[4] = 1000; /* ts_ms */
+ vals[5] = raw_smp_processor_id(); /* cpu */
+ vals[6] = (u64)(long)"thneed"; /* my_string_field */
+ vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */
+ vals[8] = 398; /* my_int_field */
/* Now generate a create_synth_test event */
ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals));
@@ -303,7 +307,7 @@ static int __init test_create_synth_event(void)
return ret;
delete:
/* We got an error after creating the event, delete it */
- ret = synth_event_delete("create_synth_test");
+ synth_event_delete("create_synth_test");
goto out;
}
@@ -422,13 +426,15 @@ static int __init test_trace_synth_event(void)
int ret;
/* Trace some bogus values just for testing */
- ret = synth_event_trace(create_synth_test, 7, /* number of values */
+ ret = synth_event_trace(create_synth_test, 9, /* number of values */
(u64)444, /* next_pid_field */
(u64)(long)"clackers", /* next_comm_field */
(u64)1000000, /* ts_ns */
+ (u64)(long)"viewmaster",/* dynstring_field_1 */
(u64)1000, /* ts_ms */
(u64)raw_smp_processor_id(), /* cpu */
(u64)(long)"Thneed", /* my_string_field */
+ (u64)(long)"yoyos", /* dynstring_field_2 */
(u64)999); /* my_int_field */
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6b11e4e2150c..78ea542ce3bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -68,13 +69,25 @@ bool ring_buffer_expanded;
static bool __read_mostly tracing_selftest_running;
/*
- * If a tracer is running, we do not want to run SELFTEST.
+ * If boot-time tracing including tracers/events via kernel cmdline
+ * is running, we do not want to run SELFTEST.
*/
bool __read_mostly tracing_selftest_disabled;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+void __init disable_tracing_selftest(const char *reason)
+{
+ if (!tracing_selftest_disabled) {
+ tracing_selftest_disabled = true;
+ pr_info("Ftrace startup test is disabled due to %s\n", reason);
+ }
+}
+#endif
+
/* Pipe tracepoints to printk */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
+static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
@@ -163,8 +176,9 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
int tracing_set_tracer(struct trace_array *tr, const char *buf);
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
- unsigned long flags, int pc);
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -184,12 +198,12 @@ __setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str) {
+ if (*str++ != '=' || !*str || !strcmp("1", str)) {
ftrace_dump_on_oops = DUMP_ALL;
return 1;
}
- if (!strcmp("orig_cpu", str)) {
+ if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
ftrace_dump_on_oops = DUMP_ORIG;
return 1;
}
@@ -244,6 +258,13 @@ static int __init set_tracepoint_printk(char *str)
}
__setup("tp_printk", set_tracepoint_printk);
+static int __init set_tracepoint_printk_stop(char *str)
+{
+ tracepoint_printk_stop_on_boot = true;
+ return 1;
+}
+__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -251,13 +272,153 @@ unsigned long long ns2usecs(u64 nsec)
return nsec;
}
+static void
+trace_process_export(struct trace_export *export,
+ struct ring_buffer_event *event, int flag)
+{
+ struct trace_entry *entry;
+ unsigned int size = 0;
+
+ if (export->flags & flag) {
+ entry = ring_buffer_event_data(event);
+ size = ring_buffer_event_length(event);
+ export->write(export, entry, size);
+ }
+}
+
+static DEFINE_MUTEX(ftrace_export_lock);
+
+static struct trace_export __rcu *ftrace_exports_list __read_mostly;
+
+static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled);
+
+static inline void ftrace_exports_enable(struct trace_export *export)
+{
+ if (export->flags & TRACE_EXPORT_FUNCTION)
+ static_branch_inc(&trace_function_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_EVENT)
+ static_branch_inc(&trace_event_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_MARKER)
+ static_branch_inc(&trace_marker_exports_enabled);
+}
+
+static inline void ftrace_exports_disable(struct trace_export *export)
+{
+ if (export->flags & TRACE_EXPORT_FUNCTION)
+ static_branch_dec(&trace_function_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_EVENT)
+ static_branch_dec(&trace_event_exports_enabled);
+
+ if (export->flags & TRACE_EXPORT_MARKER)
+ static_branch_dec(&trace_marker_exports_enabled);
+}
+
+static void ftrace_exports(struct ring_buffer_event *event, int flag)
+{
+ struct trace_export *export;
+
+ preempt_disable_notrace();
+
+ export = rcu_dereference_raw_check(ftrace_exports_list);
+ while (export) {
+ trace_process_export(export, event, flag);
+ export = rcu_dereference_raw_check(export->next);
+ }
+
+ preempt_enable_notrace();
+}
+
+static inline void
+add_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ rcu_assign_pointer(export->next, *list);
+ /*
+ * We are entering export into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the export->next pointer is valid before another CPU sees
+ * the export pointer included into the list.
+ */
+ rcu_assign_pointer(*list, export);
+}
+
+static inline int
+rm_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ struct trace_export **p;
+
+ for (p = list; *p != NULL; p = &(*p)->next)
+ if (*p == export)
+ break;
+
+ if (*p != export)
+ return -1;
+
+ rcu_assign_pointer(*p, (*p)->next);
+
+ return 0;
+}
+
+static inline void
+add_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ ftrace_exports_enable(export);
+
+ add_trace_export(list, export);
+}
+
+static inline int
+rm_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ int ret;
+
+ ret = rm_trace_export(list, export);
+ ftrace_exports_disable(export);
+
+ return ret;
+}
+
+int register_ftrace_export(struct trace_export *export)
+{
+ if (WARN_ON_ONCE(!export->write))
+ return -1;
+
+ mutex_lock(&ftrace_export_lock);
+
+ add_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_export);
+
+int unregister_ftrace_export(struct trace_export *export)
+{
+ int ret;
+
+ mutex_lock(&ftrace_export_lock);
+
+ ret = rm_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_export);
+
/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS \
(FUNCTION_DEFAULT_FLAGS | \
TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
+ TRACE_ITER_HASH_PTR)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -303,6 +464,7 @@ static void __trace_array_put(struct trace_array *this_tr)
/**
* trace_array_put - Decrement the reference counter for this trace array.
+ * @this_tr : pointer to the trace array
*
* NOTE: Use this when we no longer need the trace array returned by
* trace_array_get_by_name(). This ensures the trace array can be later
@@ -350,35 +512,23 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
-void trace_free_pid_list(struct trace_pid_list *pid_list)
-{
- vfree(pid_list->pids);
- kfree(pid_list);
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
* @search_pid: The PID to find in @filtered_pids
*
- * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
*/
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
+ return trace_pid_list_is_set(filtered_pids, search_pid);
}
/**
* trace_ignore_this_task - should a task be ignored for tracing
* @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
* @task: The task that should be ignored if not filtered
*
* Checks if @task should be traced or not from @filtered_pids.
@@ -386,16 +536,22 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
* Returns false if @task should be traced.
*/
bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list *filtered_no_pids,
+ struct task_struct *task)
{
/*
- * Return false, because if filtered_pids does not exist,
- * all pids are good to trace.
+ * If filtered_no_pids is not empty, and the task's pid is listed
+ * in filtered_no_pids, then return true.
+ * Otherwise, if filtered_pids is empty, that means we can
+ * trace all tasks. If it has content, then only trace pids
+ * within filtered_pids.
*/
- if (!filtered_pids)
- return false;
- return !trace_find_filtered_pid(filtered_pids, task->pid);
+ return (filtered_pids &&
+ !trace_find_filtered_pid(filtered_pids, task->pid)) ||
+ (filtered_no_pids &&
+ trace_find_filtered_pid(filtered_no_pids, task->pid));
}
/**
@@ -423,15 +579,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
return;
}
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
/* "self" is set for forks, and NULL for exits */
if (self)
- set_bit(task->pid, pid_list->pids);
+ trace_pid_list_set(pid_list, task->pid);
else
- clear_bit(task->pid, pid_list->pids);
+ trace_pid_list_clear(pid_list, task->pid);
}
/**
@@ -448,18 +600,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
*/
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
{
- unsigned long pid = (unsigned long)v;
+ long pid = (unsigned long)v;
+ unsigned int next;
(*pos)++;
- /* pid already is +1 of the actual prevous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+ /* pid already is +1 of the actual previous bit */
+ if (trace_pid_list_next(pid_list, pid, &next) < 0)
+ return NULL;
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
+ pid = next;
- return NULL;
+ /* Return pid + 1 to allow zero to be represented */
+ return (void *)(pid + 1);
}
/**
@@ -476,12 +629,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
{
unsigned long pid;
+ unsigned int first;
loff_t l = 0;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
+ if (trace_pid_list_first(pid_list, &first) < 0)
return NULL;
+ pid = first;
+
/* Return pid + 1 so that zero can be the exit value */
for (pid++; pid && l < *pos;
pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
@@ -517,7 +672,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
unsigned long val;
int nr_pids = 0;
ssize_t read = 0;
- ssize_t ret = 0;
+ ssize_t ret;
loff_t pos;
pid_t pid;
@@ -530,34 +685,23 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* the user. If the operation fails, then the current list is
* not modified.
*/
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ pid_list = trace_pid_list_alloc();
if (!pid_list) {
trace_parser_put(&parser);
return -ENOMEM;
}
- pid_list->pid_max = READ_ONCE(pid_max);
-
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
-
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- trace_parser_put(&parser);
- kfree(pid_list);
- return -ENOMEM;
- }
-
if (filtered_pids) {
/* copy the current bits to the new max */
- for_each_set_bit(pid, filtered_pids->pids,
- filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
+ ret = trace_pid_list_first(filtered_pids, &pid);
+ while (!ret) {
+ trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
}
+ ret = 0;
while (cnt > 0) {
pos = 0;
@@ -573,12 +717,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
- if (val >= pid_list->pid_max)
- break;
pid = (pid_t)val;
- set_bit(pid, pid_list->pids);
+ if (trace_pid_list_set(pid_list, pid) < 0) {
+ ret = -1;
+ break;
+ }
nr_pids++;
trace_parser_clear(&parser);
@@ -587,13 +732,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_put(&parser);
if (ret < 0) {
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
return ret;
}
if (!nr_pids) {
/* Cleared the list of pids */
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
read = ret;
pid_list = NULL;
}
@@ -611,7 +756,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ts = ring_buffer_time_stamp(buf->buffer);
ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
@@ -623,7 +768,7 @@ u64 ftrace_now(int cpu)
}
/**
- * tracing_is_enabled - Show if global_trace has been disabled
+ * tracing_is_enabled - Show if global_trace has been enabled
*
* Shows if the global trace has been enabled or not. It uses the
* mirror flag "buffer_disabled" to be used in fast paths such as for
@@ -674,7 +819,7 @@ DEFINE_MUTEX(trace_types_lock);
* The content of events may become garbage if we allow other process consumes
* these events concurrently:
* A) the page of the consumed events may become a normal page
- * (not reader page) in ring buffer, and this page will be rewrited
+ * (not reader page) in ring buffer, and this page will be rewritten
* by events producer.
* B) The page of the consumed events may become a page for splice_read,
* and this page will be returned to system.
@@ -748,23 +893,23 @@ static inline void trace_access_lock_init(void)
#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
#else
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned long trace_ctx,
+ int skip, struct pt_regs *regs)
{
}
@@ -772,24 +917,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
+ int type, unsigned int trace_ctx)
{
struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, type, flags, pc);
+ tracing_generic_entry_update(ent, type, trace_ctx);
}
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(buffer, len);
if (event != NULL)
- trace_event_setup(event, type, flags, pc);
+ trace_event_setup(event, type, trace_ctx);
return event;
}
@@ -850,25 +995,22 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int alloc;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
if (!event) {
size = 0;
goto out;
@@ -887,7 +1029,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
@@ -904,25 +1046,22 @@ int __trace_bputs(unsigned long ip, const char *str)
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
- unsigned long irq_flags;
+ unsigned int trace_ctx;
int size = sizeof(struct bputs_entry);
int ret = 0;
- int pc;
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
- pc = preempt_count();
-
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
- local_save_flags(irq_flags);
+ trace_ctx = tracing_gen_ctx();
buffer = global_trace.array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ trace_ctx);
if (!event)
goto out;
@@ -931,7 +1070,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -941,7 +1080,8 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data)
+static void tracing_snapshot_instance_cond(struct trace_array *tr,
+ void *cond_data)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -1292,8 +1432,11 @@ EXPORT_SYMBOL_GPL(tracing_off);
void disable_trace_on_warning(void)
{
- if (__disable_trace_on_warning)
+ if (__disable_trace_on_warning) {
+ trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
+ "Disabling tracing due to warning\n");
tracing_off();
+ }
}
/**
@@ -1362,7 +1505,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
#undef C
#define C(a, b) b
-/* These must match the bit postions in trace_iterator_flags */
+/* These must match the bit positions in trace_iterator_flags */
static const char *trace_options[] = {
TRACE_FLAGS
NULL
@@ -1524,8 +1667,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
static const struct file_operations tracing_max_lat_fops;
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#ifdef LATENCY_FS_NOTIFY
static struct workqueue_struct *fsnotify_wq;
@@ -1533,8 +1675,7 @@ static void latency_fsnotify_workfn(struct work_struct *work)
{
struct trace_array *tr = container_of(work, struct trace_array,
fsnotify_work);
- fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
- tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@ -1549,7 +1690,8 @@ static void trace_create_maxlat_file(struct trace_array *tr,
{
INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
- tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
d_tracer, &tr->max_latency,
&tracing_max_lat_fops);
}
@@ -1579,16 +1721,15 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- * defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
#define trace_create_maxlat_file(tr, d_tracer) \
- trace_create_file("tracing_max_latency", 0644, d_tracer, \
- &tr->max_latency, &tracing_max_lat_fops)
+ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
+ d_tracer, &tr->max_latency, &tracing_max_lat_fops)
+#else
+#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -1772,6 +1913,12 @@ static int run_tracer_selftest(struct tracer *type)
if (!selftests_can_run)
return save_selftest(type);
+ if (!tracing_is_on()) {
+ pr_warn("Selftest for tracer %s skipped due to tracing disabled\n",
+ type->name);
+ return 0;
+ }
+
/*
* Run a selftest on this tracer.
* Here we reset the trace buffer, and set the current
@@ -1964,11 +2111,7 @@ int __init register_tracer(struct tracer *type)
apply_trace_boot_options();
/* disable other selftests, since this will break it. */
- tracing_selftest_disabled = true;
-#ifdef CONFIG_FTRACE_STARTUP_TEST
- printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
- type->name);
-#endif
+ disable_tracing_selftest("running a tracer");
out_unlock:
return ret;
@@ -1993,7 +2136,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
void tracing_reset_online_cpus(struct array_buffer *buf)
{
struct trace_buffer *buffer = buf->buffer;
- int cpu;
if (!buffer)
return;
@@ -2005,8 +2147,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
buf->time_start = buffer_ftrace_now(buf, buf->cpu);
- for_each_online_cpu(cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ ring_buffer_reset_online_cpus(buffer);
ring_buffer_record_enable(buffer);
}
@@ -2027,8 +2168,15 @@ void tracing_reset_all_online_cpus(void)
}
}
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
static int *tgid_map;
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -2041,9 +2189,6 @@ struct saved_cmdlines_buffer {
};
static struct saved_cmdlines_buffer *savedcmd;
-/* temporary disable recording */
-static atomic_t trace_record_taskinfo_disabled __read_mostly;
-
static inline char *get_saved_cmdlines(int idx)
{
return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
@@ -2233,14 +2378,13 @@ static void tracing_stop_tr(struct trace_array *tr)
static int trace_save_cmdline(struct task_struct *tsk)
{
- unsigned pid, idx;
+ unsigned tpid, idx;
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(tsk->pid > PID_MAX_DEFAULT))
- return 0;
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
* It's not the end of the world if we don't get
@@ -2251,26 +2395,15 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
- idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tpid];
if (idx == NO_CMDLINE_MAP) {
idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
- /*
- * Check whether the cmdline buffer at idx has a pid
- * mapped. We are going to overwrite that entry so we
- * need to clear the map_pid_to_cmdline. Otherwise we
- * would read the new comm for the old pid.
- */
- pid = savedcmd->map_cmdline_to_pid[idx];
- if (pid != NO_CMDLINE_MAP)
- savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
-
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
savedcmd->cmdline_idx = idx;
}
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
@@ -2281,6 +2414,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
+ int tpid;
if (!pid) {
strcpy(comm, "<idle>");
@@ -2292,16 +2426,16 @@ static void __trace_find_cmdline(int pid, char comm[])
return;
}
- if (pid > PID_MAX_DEFAULT) {
- strcpy(comm, "<...>");
- return;
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
}
-
- map = savedcmd->map_pid_to_cmdline[pid];
- if (map != NO_CMDLINE_MAP)
- strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- else
- strcpy(comm, "<...>");
+ strcpy(comm, "<...>");
}
void trace_find_cmdline(int pid, char comm[])
@@ -2315,24 +2449,41 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
int trace_find_tgid(int pid)
{
- if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
- return 0;
+ int *ptr = trace_find_tgid_ptr(pid);
- return tgid_map[pid];
+ return ptr ? *ptr : 0;
}
static int trace_save_tgid(struct task_struct *tsk)
{
+ int *ptr;
+
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
return 0;
- tgid_map[tsk->pid] = tsk->tgid;
+ *ptr = tsk->tgid;
return 1;
}
@@ -2340,8 +2491,6 @@ static bool tracing_record_taskinfo_skip(int flags)
{
if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
return true;
- if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
- return true;
if (!__this_cpu_read(trace_taskinfo_save))
return true;
return false;
@@ -2430,36 +2579,44 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
- unsigned long flags, int pc)
+static unsigned short migration_disable_value(void)
{
- struct task_struct *tsk = current;
-
- entry->preempt_count = pc & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
- entry->type = type;
- entry->flags =
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#if defined(CONFIG_SMP)
+ return current->migration_disabled;
#else
- TRACE_FLAG_IRQS_NOSUPPORT |
+ return 0;
#endif
- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
+
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
+{
+ unsigned int trace_flags = irqs_status;
+ unsigned int pc;
+
+ pc = preempt_count();
+
+ if (pc & NMI_MASK)
+ trace_flags |= TRACE_FLAG_NMI;
+ if (pc & HARDIRQ_MASK)
+ trace_flags |= TRACE_FLAG_HARDIRQ;
+ if (in_serving_softirq())
+ trace_flags |= TRACE_FLAG_SOFTIRQ;
+
+ if (tif_need_resched())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED;
+ if (test_preempt_need_resched())
+ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
+}
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2504,7 +2661,7 @@ void trace_buffered_event_enable(void)
preempt_disable();
if (cpu == smp_processor_id() &&
- this_cpu_read(trace_buffered_event) !=
+ __this_cpu_read(trace_buffered_event) !=
per_cpu(trace_buffered_event, cpu))
WARN_ON_ONCE(1);
preempt_enable();
@@ -2579,38 +2736,75 @@ struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
struct trace_event_file *trace_file,
int type, unsigned long len,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
struct ring_buffer_event *entry;
+ struct trace_array *tr = trace_file->tr;
int val;
- *current_rb = trace_file->tr->array_buffer.buffer;
+ *current_rb = tr->array_buffer.buffer;
- if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
- (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+ if (!tr->no_filter_buffering_ref &&
+ (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
- /* Try to use the per cpu buffer first */
+ /*
+ * Filtering is on, so try to use the per cpu buffer first.
+ * This buffer will simulate a ring_buffer_event,
+ * where the type_len is zero and the array[0] will
+ * hold the full length.
+ * (see include/linux/ring-buffer.h for details on
+ * how the ring_buffer_event is structured).
+ *
+ * Using a temp buffer during filtering and copying it
+ * on a matched filter is quicker than writing directly
+ * into the ring buffer and then discarding it when
+ * it doesn't match. That is because the discard
+ * requires several atomic operations to get right.
+ * Copying on match and doing nothing on a failed match
+ * is still quicker than no copy on match, but having
+ * to discard out of the ring buffer on a failed match.
+ */
+ int max_len = PAGE_SIZE - struct_size(entry, array, 1);
+
val = this_cpu_inc_return(trace_buffered_event_cnt);
- if (val == 1) {
- trace_event_setup(entry, type, flags, pc);
+
+ /*
+ * Preemption is disabled, but interrupts and NMIs
+ * can still come in now. If that happens after
+ * the above increment, then it will have to go
+ * back to the old method of allocating the event
+ * on the ring buffer, and if the filter fails, it
+ * will have to call ring_buffer_discard_commit()
+ * to remove it.
+ *
+ * Need to also check the unlikely case that the
+ * length is bigger than the temp buffer size.
+ * If that happens, then the reserve is pretty much
+ * guaranteed to fail, as the ring buffer currently
+ * only allows events less than a page. But that may
+ * change in the future, so let the ring buffer reserve
+ * handle the failure in that case.
+ */
+ if (val == 1 && likely(len <= max_len)) {
+ trace_event_setup(entry, type, trace_ctx);
entry->array[0] = len;
return entry;
}
this_cpu_dec(trace_buffered_event_cnt);
}
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
- * to store the trace event for the tigger to use. It's recusive
+ * to store the trace event for the trigger to use. It's recursive
* safe and will not be recorded anywhere.
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = __trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+ trace_ctx);
}
return entry;
}
@@ -2655,7 +2849,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
@@ -2689,12 +2883,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
+ enum event_trigger_type tt = ETT_NONE;
+ struct trace_event_file *file = fbuffer->trace_file;
+
+ if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
+ fbuffer->entry, &tt))
+ goto discard;
+
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
- event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc, fbuffer->regs);
+ if (static_branch_unlikely(&trace_event_exports_enabled))
+ ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
+
+ trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
+ fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);
+
+discard:
+ if (tt)
+ event_triggers_post_call(file, tt);
+
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -2710,7 +2918,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trace_ctx,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
@@ -2721,8 +2929,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* and mmiotrace, but that's ok if they lose a function or
* two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
- ftrace_trace_userstack(buffer, flags, pc);
+ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+ ftrace_trace_userstack(tr, buffer, trace_ctx);
}
/*
@@ -2735,133 +2943,9 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
__buffer_unlock_commit(buffer, event);
}
-static void
-trace_process_export(struct trace_export *export,
- struct ring_buffer_event *event)
-{
- struct trace_entry *entry;
- unsigned int size = 0;
-
- entry = ring_buffer_event_data(event);
- size = ring_buffer_event_length(event);
- export->write(export, entry, size);
-}
-
-static DEFINE_MUTEX(ftrace_export_lock);
-
-static struct trace_export __rcu *ftrace_exports_list __read_mostly;
-
-static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled);
-
-static inline void ftrace_exports_enable(void)
-{
- static_branch_enable(&ftrace_exports_enabled);
-}
-
-static inline void ftrace_exports_disable(void)
-{
- static_branch_disable(&ftrace_exports_enabled);
-}
-
-static void ftrace_exports(struct ring_buffer_event *event)
-{
- struct trace_export *export;
-
- preempt_disable_notrace();
-
- export = rcu_dereference_raw_check(ftrace_exports_list);
- while (export) {
- trace_process_export(export, event);
- export = rcu_dereference_raw_check(export->next);
- }
-
- preempt_enable_notrace();
-}
-
-static inline void
-add_trace_export(struct trace_export **list, struct trace_export *export)
-{
- rcu_assign_pointer(export->next, *list);
- /*
- * We are entering export into the list but another
- * CPU might be walking that list. We need to make sure
- * the export->next pointer is valid before another CPU sees
- * the export pointer included into the list.
- */
- rcu_assign_pointer(*list, export);
-}
-
-static inline int
-rm_trace_export(struct trace_export **list, struct trace_export *export)
-{
- struct trace_export **p;
-
- for (p = list; *p != NULL; p = &(*p)->next)
- if (*p == export)
- break;
-
- if (*p != export)
- return -1;
-
- rcu_assign_pointer(*p, (*p)->next);
-
- return 0;
-}
-
-static inline void
-add_ftrace_export(struct trace_export **list, struct trace_export *export)
-{
- if (*list == NULL)
- ftrace_exports_enable();
-
- add_trace_export(list, export);
-}
-
-static inline int
-rm_ftrace_export(struct trace_export **list, struct trace_export *export)
-{
- int ret;
-
- ret = rm_trace_export(list, export);
- if (*list == NULL)
- ftrace_exports_disable();
-
- return ret;
-}
-
-int register_ftrace_export(struct trace_export *export)
-{
- if (WARN_ON_ONCE(!export->write))
- return -1;
-
- mutex_lock(&ftrace_export_lock);
-
- add_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_export);
-
-int unregister_ftrace_export(struct trace_export *export)
-{
- int ret;
-
- mutex_lock(&ftrace_export_lock);
-
- ret = rm_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_export);
-
void
-trace_function(struct trace_array *tr,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
+trace_function(struct trace_array *tr, unsigned long ip, unsigned long
+ parent_ip, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2869,7 +2953,7 @@ trace_function(struct trace_array *tr,
struct ftrace_entry *entry;
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
return;
entry = ring_buffer_event_data(event);
@@ -2877,8 +2961,8 @@ trace_function(struct trace_array *tr,
entry->parent_ip = parent_ip;
if (!call_filter_check_discard(call, entry, buffer, event)) {
- if (static_branch_unlikely(&ftrace_exports_enabled))
- ftrace_exports(event);
+ if (static_branch_unlikely(&trace_function_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_FUNCTION);
__buffer_unlock_commit(buffer, event);
}
}
@@ -2903,8 +2987,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
@@ -2922,18 +3006,12 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
skip++;
#endif
- /*
- * Since events can happen in NMIs there's no safe way to
- * use the per cpu ftrace_stacks. We reserve it and if an interrupt
- * or NMI comes in, it will just have to use the default
- * FTRACE_STACK_SIZE.
- */
preempt_disable_notrace();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
/* This should never happen. If it does, yell once and skip */
- if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
goto out;
/*
@@ -2957,7 +3035,8 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ (sizeof(*entry) - sizeof(entry->caller)) + size,
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -2978,22 +3057,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
static inline void ftrace_trace_stack(struct trace_array *tr,
struct trace_buffer *buffer,
- unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
}
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
return;
}
@@ -3007,7 +3086,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
return;
rcu_irq_enter_irqson();
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
rcu_irq_exit_irqson();
}
@@ -3017,19 +3096,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
*/
void trace_dump_stack(int skip)
{
- unsigned long flags;
-
if (tracing_disabled || tracing_selftest_running)
return;
- local_save_flags(flags);
-
#ifndef CONFIG_UNWINDER_ORC
/* Skip 1 to skip this function. */
skip++;
#endif
__ftrace_trace_stack(global_trace.array_buffer.buffer,
- flags, skip, preempt_count(), NULL);
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3037,13 +3112,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
static void
-ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer, unsigned int trace_ctx)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -3064,7 +3140,7 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -3082,38 +3158,73 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
- unsigned long flags, int pc)
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* CONFIG_STACKTRACE */
+static inline void
+func_repeats_set_delta_ts(struct func_repeats_entry *entry,
+ unsigned long long delta)
+{
+ entry->bottom_delta_ts = delta & U32_MAX;
+ entry->top_delta_ts = (delta >> 32);
+}
+
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx)
+{
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct func_repeats_entry *entry;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS,
+ sizeof(*entry), trace_ctx);
+ if (!event)
+ return;
+
+ delta = ring_buffer_event_time_stamp(buffer, event) -
+ last_info->ts_last_call;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = last_info->ip;
+ entry->parent_ip = last_info->parent_ip;
+ entry->count = last_info->count;
+ func_repeats_set_delta_ts(entry, delta);
+
+ __buffer_unlock_commit(buffer, event);
+}
+
/* created for use with alloc_percpu */
struct trace_buffer_struct {
int nesting;
char buffer[4][TRACE_BUF_SIZE];
};
-static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
/*
- * Thise allows for lockless recording. If we're nested too deeply, then
+ * This allows for lockless recording. If we're nested too deeply, then
* this returns NULL.
*/
static char *get_trace_buf(void)
{
struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!buffer || buffer->nesting >= 4)
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
return NULL;
buffer->nesting++;
/* Interrupts must see nesting incremented before we use the buffer */
barrier();
- return &buffer->buffer[buffer->nesting][0];
+ return &buffer->buffer[buffer->nesting - 1][0];
}
static void put_trace_buf(void)
@@ -3125,7 +3236,10 @@ static void put_trace_buf(void)
static int alloc_percpu_trace_buffer(void)
{
- struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct __percpu *buffers;
+
+ if (trace_percpu_buffer)
+ return 0;
buffers = alloc_percpu(struct trace_buffer_struct);
if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
@@ -3210,9 +3324,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_buffer *buffer;
struct trace_array *tr = &global_trace;
struct bprint_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
- int len = 0, size, pc;
+ int len = 0, size;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3220,7 +3334,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
tbuffer = get_trace_buf();
@@ -3234,12 +3348,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
goto out_put;
- local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3249,7 +3362,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3272,9 +3385,9 @@ __trace_array_vprintk(struct trace_buffer *buffer,
{
struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
- int len = 0, size, pc;
+ int len = 0, size;
struct print_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
char *tbuffer;
if (tracing_disabled || tracing_selftest_running)
@@ -3283,7 +3396,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
/* Don't pollute graph traces with trace_vprintk internals */
pause_graph_tracing();
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
@@ -3295,11 +3408,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- local_save_flags(flags);
size = sizeof(*entry) + len + 1;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3308,7 +3420,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3329,6 +3441,26 @@ int trace_array_vprintk(struct trace_array *tr,
return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, they have the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should not
+ * be ever incorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
@@ -3336,12 +3468,16 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
if (!tr)
return -ENOENT;
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3349,6 +3485,27 @@ int trace_array_printk(struct trace_array *tr,
}
EXPORT_SYMBOL_GPL(trace_array_printk);
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
@@ -3378,7 +3535,7 @@ static void trace_iterator_increment(struct trace_iterator *iter)
iter->idx++;
if (buf_iter)
- ring_buffer_read(buf_iter, NULL);
+ ring_buffer_iter_advance(buf_iter);
}
static struct trace_entry *
@@ -3388,11 +3545,15 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
- if (buf_iter)
+ if (buf_iter) {
event = ring_buffer_iter_peek(buf_iter, ts);
- else
+ if (lost_events)
+ *lost_events = ring_buffer_iter_dropped(buf_iter) ?
+ (unsigned long)-1 : 0;
+ } else {
event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
lost_events);
+ }
if (event) {
iter->ent_size = ring_buffer_event_length(event);
@@ -3462,11 +3623,349 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
return next;
}
+#define STATIC_FMT_BUF_SIZE 128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+static char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+ char *tmp;
+
+ /*
+ * iter->tr is NULL when used with tp_printk, which makes
+ * this get called where it is not safe to call krealloc().
+ */
+ if (!iter->tr || iter->fmt == static_fmt_buf)
+ return NULL;
+
+ tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+ GFP_KERNEL);
+ if (tmp) {
+ iter->fmt_size += STATIC_FMT_BUF_SIZE;
+ iter->fmt = tmp;
+ }
+
+ return tmp;
+}
+
+/* Returns true if the string is safe to dereference from an event */
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+{
+ unsigned long addr = (unsigned long)str;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+
+ /* OK if part of the event data */
+ if ((addr >= (unsigned long)iter->ent) &&
+ (addr < (unsigned long)iter->ent + iter->ent_size))
+ return true;
+
+ /* OK if part of the temp seq buffer */
+ if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
+ (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ return true;
+
+ /* Core rodata can not be freed */
+ if (is_kernel_rodata(addr))
+ return true;
+
+ if (trace_is_tracepoint_string(str))
+ return true;
+
+ /*
+ * Now this could be a module event, referencing core module
+ * data, which is OK.
+ */
+ if (!iter->ent)
+ return false;
+
+ trace_event = ftrace_find_event(iter->ent->type);
+ if (!trace_event)
+ return false;
+
+ event = container_of(trace_event, struct trace_event_call, event);
+ if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module)
+ return false;
+
+ /* Would rather have rodata, but this will suffice */
+ if (within_module_core(addr, event->module))
+ return true;
+
+ return false;
+}
+
+static const char *show_buffer(struct trace_seq *s)
+{
+ struct seq_buf *seq = &s->seq;
+
+ seq_buf_terminate(seq);
+
+ return seq->buffer;
+}
+
+static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+
+static int test_can_verify_check(const char *fmt, ...)
+{
+ char buf[16];
+ va_list ap;
+ int ret;
+
+ /*
+ * The verifier is dependent on vsnprintf() modifies the va_list
+ * passed to it, where it is sent as a reference. Some architectures
+ * (like x86_32) passes it by value, which means that vsnprintf()
+ * does not modify the va_list passed to it, and the verifier
+ * would then need to be able to understand all the values that
+ * vsnprintf can use. If it is passed by value, then the verifier
+ * is disabled.
+ */
+ va_start(ap, fmt);
+ vsnprintf(buf, 16, "%d", ap);
+ ret = va_arg(ap, int);
+ va_end(ap);
+
+ return ret;
+}
+
+static void test_can_verify(void)
+{
+ if (!test_can_verify_check("%d %d", 0, 1)) {
+ pr_info("trace event string verifier disabled\n");
+ static_branch_inc(&trace_no_verify);
+ }
+}
+
+/**
+ * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * @iter: The iterator that holds the seq buffer and the event being printed
+ * @fmt: The format used to print the event
+ * @ap: The va_list holding the data to print from @fmt.
+ *
+ * This writes the data into the @iter->seq buffer using the data from
+ * @fmt and @ap. If the format has a %s, then the source of the string
+ * is examined to make sure it is safe to print, otherwise it will
+ * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
+ * pointer.
+ */
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap)
+{
+ const char *p = fmt;
+ const char *str;
+ int i, j;
+
+ if (WARN_ON_ONCE(!fmt))
+ return;
+
+ if (static_branch_unlikely(&trace_no_verify))
+ goto print;
+
+ /* Don't bother checking when doing a ftrace_dump() */
+ if (iter->fmt == static_fmt_buf)
+ goto print;
+
+ while (*p) {
+ bool star = false;
+ int len = 0;
+
+ j = 0;
+
+ /* We only care about %s and variants */
+ for (i = 0; p[i]; i++) {
+ if (i + 1 >= iter->fmt_size) {
+ /*
+ * If we can't expand the copy buffer,
+ * just print it.
+ */
+ if (!trace_iter_expand_format(iter))
+ goto print;
+ }
+
+ if (p[i] == '\\' && p[i+1]) {
+ i++;
+ continue;
+ }
+ if (p[i] == '%') {
+ /* Need to test cases like %08.*s */
+ for (j = 1; p[i+j]; j++) {
+ if (isdigit(p[i+j]) ||
+ p[i+j] == '.')
+ continue;
+ if (p[i+j] == '*') {
+ star = true;
+ continue;
+ }
+ break;
+ }
+ if (p[i+j] == 's')
+ break;
+ star = false;
+ }
+ j = 0;
+ }
+ /* If no %s found then just print normally */
+ if (!p[i])
+ break;
+
+ /* Copy up to the %s, and print that */
+ strncpy(iter->fmt, p, i);
+ iter->fmt[i] = '\0';
+ trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+
+ /*
+ * If iter->seq is full, the above call no longer guarantees
+ * that ap is in sync with fmt processing, and further calls
+ * to va_arg() can return wrong positional arguments.
+ *
+ * Ensure that ap is no longer used in this case.
+ */
+ if (iter->seq.full) {
+ p = "";
+ break;
+ }
+
+ if (star)
+ len = va_arg(ap, int);
+
+ /* The ap now points to the string data of the %s */
+ str = va_arg(ap, const char *);
+
+ /*
+ * If you hit this warning, it is likely that the
+ * trace event in question used %s on a string that
+ * was saved at the time of the event, but may not be
+ * around when the trace is read. Use __string(),
+ * __assign_str() and __get_str() helpers in the TRACE_EVENT()
+ * instead. See samples/trace_events/trace-events-sample.h
+ * for reference.
+ */
+ if (WARN_ONCE(!trace_safe_str(iter, str),
+ "fmt: '%s' current_buffer: '%s'",
+ fmt, show_buffer(&iter->seq))) {
+ int ret;
+
+ /* Try to safely read the string */
+ if (star) {
+ if (len + 1 > iter->fmt_size)
+ len = iter->fmt_size - 1;
+ if (len < 0)
+ len = 0;
+ ret = copy_from_kernel_nofault(iter->fmt, str, len);
+ iter->fmt[len] = 0;
+ star = false;
+ } else {
+ ret = strncpy_from_kernel_nofault(iter->fmt, str,
+ iter->fmt_size);
+ }
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", str);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ str, iter->fmt);
+ str = "[UNSAFE-MEMORY]";
+ strcpy(iter->fmt, "%s");
+ } else {
+ strncpy(iter->fmt, p + i, j + 1);
+ iter->fmt[j+1] = '\0';
+ }
+ if (star)
+ trace_seq_printf(&iter->seq, iter->fmt, len, str);
+ else
+ trace_seq_printf(&iter->seq, iter->fmt, str);
+
+ p += i + j + 1;
+ }
+ print:
+ if (*p)
+ trace_seq_vprintf(&iter->seq, p, ap);
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+ const char *p, *new_fmt;
+ char *q;
+
+ if (WARN_ON_ONCE(!fmt))
+ return fmt;
+
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ return fmt;
+
+ p = fmt;
+ new_fmt = q = iter->fmt;
+ while (*p) {
+ if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+ if (!trace_iter_expand_format(iter))
+ return fmt;
+
+ q += iter->fmt - new_fmt;
+ new_fmt = iter->fmt;
+ }
+
+ *q++ = *p++;
+
+ /* Replace %p with %px */
+ if (p[-1] == '%') {
+ if (p[0] == '%') {
+ *q++ = *p++;
+ } else if (p[0] == 'p' && !isalnum(p[1])) {
+ *q++ = *p++;
+ *q++ = 'x';
+ }
+ }
+ }
+ *q = '\0';
+
+ return new_fmt;
+}
+
+#define STATIC_TEMP_BUF_SIZE 128
+static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
+
/* Find the next real entry, without updating the iterator itself */
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts)
{
- return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
+ /* __find_next_entry will reset ent_size */
+ int ent_size = iter->ent_size;
+ struct trace_entry *entry;
+
+ /*
+ * If called from ftrace_dump(), then the iter->temp buffer
+ * will be the static_temp_buf and not created from kmalloc.
+ * If the entry size is greater than the buffer, we can
+ * not save it. Just return NULL in that case. This is only
+ * used to add markers when two consecutive events' time
+ * stamps have a large delta. See trace_print_lat_context()
+ */
+ if (iter->temp == static_temp_buf &&
+ STATIC_TEMP_BUF_SIZE < ent_size)
+ return NULL;
+
+ /*
+ * The __find_next_entry() may call peek_next_entry(), which may
+ * call ring_buffer_peek() that may make the contents of iter->ent
+ * undefined. Need to copy iter->ent now.
+ */
+ if (iter->ent && iter->ent != iter->temp) {
+ if ((!iter->temp || iter->temp_size < iter->ent_size) &&
+ !WARN_ON_ONCE(iter->temp == static_temp_buf)) {
+ void *temp;
+ temp = kmalloc(iter->ent_size, GFP_KERNEL);
+ if (!temp)
+ return NULL;
+ kfree(iter->temp);
+ iter->temp = temp;
+ iter->temp_size = iter->ent_size;
+ }
+ memcpy(iter->temp, iter->ent, iter->ent_size);
+ iter->ent = iter->temp;
+ }
+ entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts);
+ /* Put back the original ent_size */
+ iter->ent_size = ent_size;
+
+ return entry;
}
/* Find the next real entry, and increment the iterator to the next entry */
@@ -3516,7 +4015,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
- struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
@@ -3534,11 +4032,11 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* that a reset never took place on a cpu. This is evident
* by the timestamp being before the start of the buffer.
*/
- while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ while (ring_buffer_iter_peek(buf_iter, &ts)) {
if (ts >= iter->array_buffer->time_start)
break;
entries++;
- ring_buffer_read(buf_iter, NULL);
+ ring_buffer_iter_advance(buf_iter);
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
@@ -3573,9 +4071,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EBUSY);
#endif
- if (!iter->snapshot)
- atomic_inc(&trace_record_taskinfo_disabled);
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -3618,9 +4113,6 @@ static void s_stop(struct seq_file *m, void *p)
return;
#endif
- if (!iter->snapshot)
- atomic_dec(&trace_record_taskinfo_disabled);
-
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -3690,14 +4182,15 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / _-=> migrate-disable \n"
+ "# ||||| / delay \n"
+ "# cmd pid |||||| time | caller \n"
+ "# \\ / |||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -3718,26 +4211,27 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
print_event_info(buf, m);
- seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
- seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
+ seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : "");
+ seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char *space = " ";
- int prec = tgid ? 10 : 2;
+ const char *space = " ";
+ int prec = tgid ? 12 : 2;
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
+ seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
}
void
@@ -3981,8 +4475,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
enum print_line_t ret;
if (iter->lost_events) {
- trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events);
+ if (iter->lost_events == (unsigned long)-1)
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n",
+ iter->cpu);
+ else
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
if (trace_seq_has_overflowed(&iter->seq))
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -4198,6 +4696,28 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
goto release;
/*
+ * trace_find_next_entry() may need to save off iter->ent.
+ * It will place it into the iter->temp buffer. As most
+ * events are less than 128, allocate a buffer of that size.
+ * If one is greater, then trace_find_next_entry() will
+ * allocate a new buffer to adjust for the bigger iter->ent.
+ * It's not critical if it fails to get allocated here.
+ */
+ iter->temp = kmalloc(128, GFP_KERNEL);
+ if (iter->temp)
+ iter->temp_size = 128;
+
+ /*
+ * trace_event_printf() may need to modify given format
+ * string to replace %p with %px so that it shows real address
+ * instead of hash value. However, that is only for the event
+ * tracing, other tracer may not need. Defer the allocation
+ * until it is needed.
+ */
+ iter->fmt = NULL;
+ iter->fmt_size = 0;
+
+ /*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
*/
@@ -4237,8 +4757,11 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (trace_clocks[tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- /* stop the trace while dumping if we are not opening "snapshot" */
- if (!iter->snapshot)
+ /*
+ * If pause-on-trace is enabled, then stop the trace while
+ * dumping, unless this is the "snapshot" file
+ */
+ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
tracing_stop_tr(tr);
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
@@ -4269,6 +4792,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
+ kfree(iter->temp);
kfree(iter->buffer_iter);
release:
seq_release_private(inode, file);
@@ -4334,7 +4858,7 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- if (!iter->snapshot)
+ if (!iter->snapshot && tr->stop_count)
/* reenable tracing if it was previously enabled */
tracing_start_tr(tr);
@@ -4344,6 +4868,8 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
+ kfree(iter->fmt);
+ kfree(iter->temp);
kfree(iter->trace);
kfree(iter->buffer_iter);
seq_release_private(inode, file);
@@ -4620,7 +5146,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -4725,6 +5251,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ int *map;
+
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -4747,10 +5275,19 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map)
- tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
- sizeof(*tgid_map),
- GFP_KERNEL);
+ if (!tgid_map) {
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ }
if (!tgid_map) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
@@ -4964,6 +5501,8 @@ static const char readme_msg[] =
#ifdef CONFIG_FUNCTION_TRACER
" set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
"\t\t (function)\n"
+ " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n"
+ "\t\t (function)\n"
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
@@ -5004,16 +5543,17 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
+ "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
- "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+ "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
- " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n"
+ " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
- "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#else
@@ -5028,6 +5568,8 @@ static const char readme_msg[] =
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
"\t [unsigned] char/int/long\n"
#endif
+ "\t efield: For event probes ('e' types), the field is on of the fields\n"
+ "\t of the <attached-group>/<attached-event>.\n"
#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
@@ -5075,6 +5617,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
" hist trigger\t- If set, event hits are aggregated into a hash table\n"
"\t Format: hist:keys=<field1[,field2,...]>\n"
+ "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n"
"\t [:values=<field1[,field2,...]>]\n"
"\t [:sort=<field1[,field2,...]>]\n"
"\t [:size=#entries]\n"
@@ -5082,6 +5625,20 @@ static const char readme_msg[] =
"\t [:name=histname1]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
+ "\t Note, special fields can be used as well:\n"
+ "\t common_timestamp - to record current timestamp\n"
+ "\t common_cpu - to record the CPU the event happened on\n"
+ "\n"
+ "\t A hist trigger variable can be:\n"
+ "\t - a reference to a field e.g. x=current_timestamp,\n"
+ "\t - a reference to another variable e.g. y=$x,\n"
+ "\t - a numeric literal: e.g. ms_per_sec=1000,\n"
+ "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
+ "\n"
+ "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n"
+ "\t multiplication(*) and division(/) operators. An operand can be either a\n"
+ "\t variable reference, field or numeric literal.\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
@@ -5111,6 +5668,7 @@ static const char readme_msg[] =
"\t .execname display a common_pid as a program name\n"
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
+ "\t .buckets=size display values in groups of size rather than raw number\n"
"\t .usecs display a common_timestamp in microseconds\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
@@ -5134,7 +5692,12 @@ static const char readme_msg[] =
"\t trace(<synthetic_event>,param list) - generate synthetic event\n"
"\t save(field,...) - save current event fields\n"
#ifdef CONFIG_TRACER_SNAPSHOT
- "\t snapshot() - snapshot the trace buffer\n"
+ "\t snapshot() - snapshot the trace buffer\n\n"
+#endif
+#ifdef CONFIG_SYNTH_EVENTS
+ " events/synthetic_events\t- Create/append/remove/show synthetic events\n"
+ "\t Write into this file to define/undefine new synthetic events.\n"
+ "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n"
#endif
#endif
;
@@ -5155,37 +5718,16 @@ static const struct file_operations tracing_readme_fops = {
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
- int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
- if (trace_find_tgid(*ptr))
- return ptr;
- }
+ int pid = ++(*pos);
- return NULL;
+ return trace_find_tgid_ptr(pid);
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- void *v;
- loff_t l = 0;
+ int pid = *pos;
- if (!tgid_map)
- return NULL;
-
- v = &tgid_map[0];
- while (l <= *pos) {
- v = saved_tgids_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
+ return trace_find_tgid_ptr(pid);
}
static void saved_tgids_stop(struct seq_file *m, void *v)
@@ -5194,9 +5736,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v)
static int saved_tgids_show(struct seq_file *m, void *v)
{
- int pid = (int *)v - tgid_map;
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
- seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ seq_printf(m, "%d %d\n", pid, tgid);
return 0;
}
@@ -5530,7 +6077,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("eval_map", 0444, d_tracer,
+ trace_create_file("eval_map", TRACE_MODE_READ, d_tracer,
NULL, &tracing_eval_map_fops);
}
@@ -5681,7 +6228,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret = size;
+ int ret;
mutex_lock(&trace_types_lock);
@@ -5811,7 +6358,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->current_trace->ref) {
+ if (tr->trace_ref) {
ret = -EBUSY;
goto out;
}
@@ -6027,7 +6574,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
- tr->current_trace->ref++;
+ tr->trace_ref++;
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -6046,7 +6593,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- tr->current_trace->ref--;
+ tr->trace_ref--;
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
@@ -6171,9 +6718,7 @@ waitagain:
cnt = PAGE_SIZE - 1;
/* reset all but tr, trace, and overruns */
- memset(&iter->seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ memset_startat(iter, 0, seq);
cpumask_clear(iter->started);
trace_seq_init(&iter->seq);
iter->pos = -1;
@@ -6231,13 +6776,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
__free_page(spd->pages[idx]);
}
-static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = generic_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
@@ -6299,7 +6837,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &tracing_pipe_buf_ops,
+ .ops = &default_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
ssize_t ret;
@@ -6510,7 +7048,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6530,7 +7067,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
/* If less than "<faulted>", then make sure we can still add that */
@@ -6539,7 +7075,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -6554,12 +7090,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
written = -EFAULT;
} else
written = cnt;
- len = cnt;
if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
/* do not add \n before testing triggers, but add \0 */
entry->buf[cnt] = '\0';
- tt = event_triggers_call(tr->trace_marker_file, entry, event);
+ tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
}
if (entry->buf[cnt - 1] != '\n') {
@@ -6568,6 +7103,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
} else
entry->buf[cnt] = '\0';
+ if (static_branch_unlikely(&trace_marker_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_MARKER);
__buffer_unlock_commit(buffer, event);
if (tt)
@@ -6590,7 +7127,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct raw_data_entry *entry;
- unsigned long irq_flags;
ssize_t written;
int size;
int len;
@@ -6612,14 +7148,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- local_save_flags(irq_flags);
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ tracing_gen_ctx());
if (!event)
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
@@ -6767,31 +7302,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
return ret;
}
-int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
+{
+ if (rbe == this_cpu_read(trace_buffered_event))
+ return ring_buffer_time_stamp(buffer);
+
+ return ring_buffer_event_time_stamp(buffer, rbe);
+}
+
+/*
+ * Set or disable using the per CPU trace_buffer_event when possible.
+ */
+int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (abs && tr->time_stamp_abs_ref++)
+ if (set && tr->no_filter_buffering_ref++)
goto out;
- if (!abs) {
- if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ if (!set) {
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
ret = -EINVAL;
goto out;
}
- if (--tr->time_stamp_abs_ref)
- goto out;
+ --tr->no_filter_buffering_ref;
}
-
- ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer)
- ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
-#endif
out:
mutex_unlock(&trace_types_lock);
@@ -7082,6 +7620,91 @@ static const struct file_operations snapshot_raw_fops = {
#endif /* CONFIG_TRACER_SNAPSHOT */
+/*
+ * trace_min_max_write - Write a u64 value to a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the write interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param structure that
+ * defines where to write the value, the min and the max acceptable values,
+ * and a lock to protect the write.
+ */
+static ssize_t
+trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ u64 val;
+ int err;
+
+ if (!param)
+ return -EFAULT;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ if (param->lock)
+ mutex_lock(param->lock);
+
+ if (param->min && val < *param->min)
+ err = -EINVAL;
+
+ if (param->max && val > *param->max)
+ err = -EINVAL;
+
+ if (!err)
+ *param->val = val;
+
+ if (param->lock)
+ mutex_unlock(param->lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+/*
+ * trace_min_max_read - Read a u64 value from a trace_min_max_param struct
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function implements the read interface for a struct trace_min_max_param.
+ * The filp->private_data must point to a trace_min_max_param struct with valid
+ * data.
+ */
+static ssize_t
+trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_min_max_param *param = filp->private_data;
+ char buf[U64_STR_SIZE];
+ int len;
+ u64 val;
+
+ if (!param)
+ return -EFAULT;
+
+ val = *param->val;
+
+ if (cnt > sizeof(buf))
+ cnt = sizeof(buf);
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+const struct file_operations trace_min_max_fops = {
+ .open = tracing_open_generic,
+ .read = trace_min_max_read,
+ .write = trace_min_max_write,
+};
+
#define TRACING_LOG_ERRS_MAX 8
#define TRACING_LOG_LOC_MAX 128
@@ -7127,11 +7750,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
* @cmd: The tracing command that caused the error
* @str: The string to position the caret at within @cmd
*
- * Finds the position of the first occurence of @str within @cmd. The
+ * Finds the position of the first occurrence of @str within @cmd. The
* return value can be passed to tracing_log_err() for caret placement
* within @cmd.
*
- * Returns the index within @cmd of the first occurence of @str or 0
+ * Returns the index within @cmd of the first occurrence of @str or 0
* if @str was not found.
*/
unsigned int err_pos(char *cmd, const char *str)
@@ -7337,7 +7960,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kvzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7355,7 +7978,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
- tr->current_trace->ref++;
+ tr->trace_ref++;
mutex_unlock(&trace_types_lock);
@@ -7456,14 +8079,14 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- iter->tr->current_trace->ref--;
+ iter->tr->trace_ref--;
__trace_array_put(iter->tr);
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
- kfree(info);
+ kvfree(info);
mutex_unlock(&trace_types_lock);
@@ -7508,9 +8131,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -7683,7 +8304,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
t, usec_rem);
- t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer));
usec_rem = do_div(t, USEC_PER_SEC);
trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
} else {
@@ -7692,7 +8313,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
trace_seq_printf(s, "now ts: %llu\n",
- ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ ring_buffer_time_stamp(trace_buf->buffer));
}
cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
@@ -7967,27 +8588,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
}
/* per cpu trace_pipe */
- trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_pipe_fops);
/* per cpu trace */
- trace_create_cpu_file("trace", 0644, d_cpu,
+ trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &tracing_fops);
- trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_buffers_fops);
- trace_create_cpu_file("stats", 0444, d_cpu,
+ trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", 0644, d_cpu,
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &snapshot_raw_fops);
#endif
}
@@ -8193,8 +8814,8 @@ create_trace_option_file(struct trace_array *tr,
topt->opt = opt;
topt->tr = tr;
- topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
- &trace_options_fops);
+ topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
+ t_options, topt, &trace_options_fops);
}
@@ -8269,7 +8890,7 @@ create_trace_option_core_file(struct trace_array *tr,
if (!t_options)
return NULL;
- return trace_create_file(option, 0644, t_options,
+ return trace_create_file(option, TRACE_MODE_WRITE, t_options,
(void *)&tr->trace_flags_index[index],
&trace_options_core_fops);
}
@@ -8452,6 +9073,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
*/
allocate_snapshot = false;
#endif
+
return 0;
}
@@ -8529,6 +9151,26 @@ struct trace_array *trace_array_find_get(const char *instance)
return tr;
}
+static int trace_array_create_dir(struct trace_array *tr)
+{
+ int ret;
+
+ tr->dir = tracefs_create_dir(tr->name, trace_instance_dir);
+ if (!tr->dir)
+ return -EINVAL;
+
+ ret = event_trace_add_tracer(tr->dir, tr);
+ if (ret) {
+ tracefs_remove(tr->dir);
+ return ret;
+ }
+
+ init_tracer_tracefs(tr, tr->dir);
+ __update_tracer_options(tr);
+
+ return ret;
+}
+
static struct trace_array *trace_array_create(const char *name)
{
struct trace_array *tr;
@@ -8564,30 +9206,28 @@ static struct trace_array *trace_array_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = tracefs_create_dir(name, trace_instance_dir);
- if (!tr->dir)
+ if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
- ret = event_trace_add_tracer(tr->dir, tr);
- if (ret) {
- tracefs_remove(tr->dir);
- goto out_free_tr;
- }
-
ftrace_init_trace_array(tr);
- init_tracer_tracefs(tr, tr->dir);
init_trace_flags_index(tr);
- __update_tracer_options(tr);
+
+ if (trace_instance_dir) {
+ ret = trace_array_create_dir(tr);
+ if (ret)
+ goto out_free_tr;
+ } else
+ __trace_early_add_events(tr);
list_add(&tr->list, &ftrace_trace_arrays);
tr->ref++;
-
return tr;
out_free_tr:
+ ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
@@ -8665,7 +9305,7 @@ static int __remove_instance(struct trace_array *tr)
int i;
/* Reference counter for a newly created trace array = 1. */
- if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
return -EBUSY;
list_del(&tr->list);
@@ -8682,6 +9322,7 @@ static int __remove_instance(struct trace_array *tr)
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove(tr->dir);
+ free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
for (i = 0; i < tr->nr_topts; i++) {
@@ -8692,7 +9333,6 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
- tr = NULL;
return 0;
}
@@ -8746,11 +9386,27 @@ static int instance_rmdir(const char *name)
static __init void create_trace_instances(struct dentry *d_tracer)
{
+ struct trace_array *tr;
+
trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
instance_mkdir,
instance_rmdir);
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->name)
+ continue;
+ if (MEM_FAIL(trace_array_create_dir(tr) < 0,
+ "Failed to create instance directory\n"))
+ break;
+ }
+
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
}
static void
@@ -8759,28 +9415,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
struct trace_event_file *file;
int cpu;
- trace_create_file("available_tracers", 0444, d_tracer,
+ trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
tr, &show_traces_fops);
- trace_create_file("current_tracer", 0644, d_tracer,
+ trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", 0644, d_tracer,
+ trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_cpumask_fops);
- trace_create_file("trace_options", 0644, d_tracer,
+ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
- trace_create_file("trace", 0644, d_tracer,
+ trace_create_file("trace", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_fops);
- trace_create_file("trace_pipe", 0444, d_tracer,
+ trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", 0644, d_tracer,
+ trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_entries_fops);
- trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
trace_create_file("free_buffer", 0200, d_tracer,
@@ -8791,42 +9447,40 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
file = __find_event_file(tr, "ftrace", "print");
if (file && file->dir)
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ file, &event_trigger_fops);
tr->trace_marker_file = file;
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", 0644, d_tracer, tr,
+ trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
&trace_clock_fops);
- trace_create_file("tracing_on", 0644, d_tracer,
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
tr, &rb_simple_fops);
- trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
&trace_time_stamp_mode_fops);
tr->buffer_percent = 50;
- trace_create_file("buffer_percent", 0444, d_tracer,
+ trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer,
tr, &buffer_percent_fops);
create_trace_options_dir(tr);
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", 0644, d_tracer,
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- trace_create_file("error_log", 0644, d_tracer,
+ trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
for_each_tracing_cpu(cpu)
@@ -8864,40 +9518,41 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
* directory. It is called via fs_initcall() by any of the boot up code
* and expects to return the dentry of the top level tracing directory.
*/
-struct dentry *tracing_init_dentry(void)
+int tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Tracing disabled due to lockdown\n");
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
/* The top level trace array uses NULL as parent */
if (tr->dir)
- return NULL;
+ return 0;
- if (WARN_ON(!tracefs_initialized()) ||
- (IS_ENABLED(CONFIG_DEBUG_FS) &&
- WARN_ON(!debugfs_initialized())))
- return ERR_PTR(-ENODEV);
+ if (WARN_ON(!tracefs_initialized()))
+ return -ENODEV;
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
* the tracefs file system there, so older tools still
- * work with the newer kerenl.
+ * work with the newer kernel.
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
- return NULL;
+ return 0;
}
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_eval_init(void)
+static struct workqueue_struct *eval_map_wq __initdata;
+static struct work_struct eval_map_work __initdata;
+
+static void __init eval_map_work_func(struct work_struct *work)
{
int len;
@@ -8905,6 +9560,33 @@ static void __init trace_eval_init(void)
trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
+static int __init trace_eval_init(void)
+{
+ INIT_WORK(&eval_map_work, eval_map_work_func);
+
+ eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
+ if (!eval_map_wq) {
+ pr_err("Unable to allocate eval_map_wq\n");
+ /* Do work here */
+ eval_map_work_func(&eval_map_work);
+ return -ENOMEM;
+ }
+
+ queue_work(eval_map_wq, &eval_map_work);
+ return 0;
+}
+
+static int __init trace_eval_sync(void)
+{
+ /* Make sure the eval map updates are finished */
+ if (eval_map_wq)
+ destroy_workqueue(eval_map_wq);
+ return 0;
+}
+
+late_initcall_sync(trace_eval_sync);
+
+
#ifdef CONFIG_MODULES
static void trace_module_add_evals(struct module *mod)
{
@@ -8967,7 +9649,7 @@ static int trace_module_notify(struct notifier_block *self,
break;
}
- return 0;
+ return NOTIFY_OK;
}
static struct notifier_block trace_module_nb = {
@@ -8978,54 +9660,56 @@ static struct notifier_block trace_module_nb = {
static __init int tracer_init_tracefs(void)
{
- struct dentry *d_tracer;
+ int ret;
trace_access_lock_init();
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
event_trace_init();
- init_tracer_tracefs(&global_trace, d_tracer);
- ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, NULL);
+ ftrace_init_tracefs_toplevel(&global_trace, NULL);
- trace_create_file("tracing_thresh", 0644, d_tracer,
+ trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL,
&global_trace, &tracing_thresh_fops);
- trace_create_file("README", 0444, d_tracer,
+ trace_create_file("README", TRACE_MODE_READ, NULL,
NULL, &tracing_readme_fops);
- trace_create_file("saved_cmdlines", 0444, d_tracer,
+ trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("saved_cmdlines_size", 0644, d_tracer,
+ trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_create_file("saved_tgids", 0444, d_tracer,
+ trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_tgids_fops);
trace_eval_init();
- trace_create_eval_file(d_tracer);
+ trace_create_eval_file(NULL);
#ifdef CONFIG_MODULES
register_module_notifier(&trace_module_nb);
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL,
NULL, &tracing_dyn_info_fops);
#endif
- create_trace_instances(d_tracer);
+ create_trace_instances(NULL);
update_tracer_options(&global_trace);
return 0;
}
+fs_initcall(tracer_init_tracefs);
+
static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
{
@@ -9142,10 +9826,14 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
+ /* Can not use kmalloc for iter.temp and iter.fmt */
+ iter.temp = static_temp_buf;
+ iter.temp_size = STATIC_TEMP_BUF_SIZE;
+ iter.fmt = static_fmt_buf;
+ iter.fmt_size = STATIC_FMT_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -9179,7 +9867,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
/*
- * We need to stop all tracing on all CPUS to read the
+ * We need to stop all tracing on all CPUS to read
* the next buffer. This is a bit expensive, but is
* not done often. We fill all what we can read,
* and then release the locks again.
@@ -9219,35 +9907,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
-int trace_run_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
#define WRITE_BUFSIZE 4096
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
+ int (*createfn)(const char *))
{
char *kbuf, *buf, *tmp;
int ret = 0;
@@ -9295,7 +9963,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (tmp)
*tmp = '\0';
- ret = trace_run_command(buf, createfn);
+ ret = createfn(buf);
if (ret)
goto out;
buf += size;
@@ -9322,7 +9990,7 @@ __init static int tracer_alloc_buffers(void)
}
/*
- * Make sure we don't accidently add more trace options
+ * Make sure we don't accidentally add more trace options
* than we have bits for.
*/
BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
@@ -9334,7 +10002,7 @@ __init static int tracer_alloc_buffers(void)
goto out_free_buffer_mask;
/* Only allocate trace_printk buffers if a trace_printk exists */
- if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+ if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt)
/* Must be called before global_trace.buffer is allocated */
trace_printk_init_buffers();
@@ -9351,7 +10019,7 @@ __init static int tracer_alloc_buffers(void)
/*
* The prepare callbacks allocates some memory for the ring buffer. We
- * don't free the buffer if the if the CPU goes down. If we were to free
+ * don't free the buffer if the CPU goes down. If we were to free
* the buffer, then the user would lose any trace that was in the
* buffer. The memory will be removed once the "instance" is removed.
*/
@@ -9423,6 +10091,8 @@ __init static int tracer_alloc_buffers(void)
register_snapshot_cmd();
+ test_can_verify();
+
return 0;
out_free_savedcmd:
@@ -9443,7 +10113,7 @@ void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
- kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
@@ -9458,7 +10128,7 @@ void __init trace_init(void)
trace_event_init();
}
-__init static int clear_boot_tracer(void)
+__init static void clear_boot_tracer(void)
{
/*
* The default tracer at boot buffer is an init section.
@@ -9468,26 +10138,21 @@ __init static int clear_boot_tracer(void)
* about to be freed.
*/
if (!default_bootup_tracer)
- return 0;
+ return;
printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
default_bootup_tracer);
default_bootup_tracer = NULL;
-
- return 0;
}
-fs_initcall(tracer_init_tracefs);
-late_initcall_sync(clear_boot_tracer);
-
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__init static int tracing_set_default_clock(void)
+__init static void tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Can not set tracing clock due to lockdown\n");
- return -EPERM;
+ return;
}
printk(KERN_WARNING
@@ -9497,8 +10162,21 @@ __init static int tracing_set_default_clock(void)
"on the kernel command line\n");
tracing_set_clock(&global_trace, "global");
}
+}
+#else
+static inline void tracing_set_default_clock(void) { }
+#endif
+__init static int late_trace_init(void)
+{
+ if (tracepoint_printk && tracepoint_printk_stop_on_boot) {
+ static_key_disable(&tracepoint_printk_key.key);
+ tracepoint_printk = 0;
+ }
+
+ tracing_set_default_clock();
+ clear_boot_tracer();
return 0;
}
-late_initcall_sync(tracing_set_default_clock);
-#endif
+
+late_initcall_sync(late_trace_init);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 99372dd7d168..38715aa6cfdf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -19,12 +19,19 @@
#include <linux/glob.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <linux/ctype.h>
+#include <linux/once_lite.h>
+
+#include "pid_list.h"
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
#include <asm/syscall.h> /* some archs define it here */
#endif
+#define TRACE_MODE_WRITE 0640
+#define TRACE_MODE_READ 0440
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -43,7 +50,10 @@ enum trace_type {
TRACE_BLK,
TRACE_BPUTS,
TRACE_HWLAT,
+ TRACE_OSNOISE,
+ TRACE_TIMERLAT,
TRACE_RAW_DATA,
+ TRACE_FUNC_REPEATS,
__TRACE_LAST_TYPE,
};
@@ -61,6 +71,9 @@ enum trace_type {
#undef __field_desc
#define __field_desc(type, container, item)
+#undef __field_packed
+#define __field_packed(type, container, item)
+
#undef __array
#define __array(type, item, size) type item[size];
@@ -94,16 +107,8 @@ enum trace_type {
#include "trace_entries.h"
/* Use this for memory failure errors */
-#define MEM_FAIL(condition, fmt, ...) ({ \
- static bool __section(.data.once) __warned; \
- int __ret_warn_once = !!(condition); \
- \
- if (unlikely(__ret_warn_once && !__warned)) { \
- __warned = true; \
- pr_err("ERROR: " fmt, ##__VA_ARGS__); \
- } \
- unlikely(__ret_warn_once); \
-})
+#define MEM_FAIL(condition, fmt, ...) \
+ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
/*
* syscalls are special, and need special handling, this is why
@@ -126,31 +131,17 @@ struct kprobe_trace_entry_head {
unsigned long ip;
};
+struct eprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned int type;
+};
+
struct kretprobe_trace_entry_head {
struct trace_entry ent;
unsigned long func;
unsigned long ret_ip;
};
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCHED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
- TRACE_FLAG_NMI = 0x40,
-};
-
#define TRACE_BUF_SIZE 1024
struct trace_array;
@@ -178,10 +169,10 @@ struct trace_array_cpu {
kuid_t uid;
char comm[TASK_COMM_LEN];
- bool ignore_pid;
#ifdef CONFIG_FUNCTION_TRACER
- bool ftrace_ignore_pid;
+ int ftrace_ignore_pid;
#endif
+ bool ignore_pid;
};
struct tracer;
@@ -202,11 +193,39 @@ struct trace_options {
struct trace_option_dentry *topts;
};
-struct trace_pid_list {
- int pid_max;
- unsigned long *pids;
+struct trace_pid_list *trace_pid_list_alloc(void);
+void trace_pid_list_free(struct trace_pid_list *pid_list);
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid);
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next);
+
+enum {
+ TRACE_PIDS = BIT(0),
+ TRACE_NO_PIDS = BIT(1),
};
+static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list,
+ struct trace_pid_list *no_pid_list)
+{
+ /* Return true if the pid list in type has pids */
+ return ((type & TRACE_PIDS) && pid_list) ||
+ ((type & TRACE_NO_PIDS) && no_pid_list);
+}
+
+static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list,
+ struct trace_pid_list *no_pid_list)
+{
+ /*
+ * Turning off what is in @type, return true if the "other"
+ * pid list, still has pids in it.
+ */
+ return (!(type & TRACE_PIDS) && pid_list) ||
+ (!(type & TRACE_NO_PIDS) && no_pid_list);
+}
+
typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
/**
@@ -219,7 +238,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
* tracing_snapshot_cond(tr, cond_data), the cond_data passed in is
* passed in turn to the cond_snapshot.update() function. That data
* can be compared by the update() implementation with the cond_data
- * contained wihin the struct cond_snapshot instance associated with
+ * contained within the struct cond_snapshot instance associated with
* the trace_array. Because the tr->max_lock is held throughout the
* update() call, the update() function can directly retrieve the
* cond_snapshot and cond_data associated with the per-instance
@@ -244,7 +263,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
* take the snapshot, by returning 'true' if so, 'false' if no
* snapshot should be taken. Because the max_lock is held for
* the duration of update(), the implementation is safe to
- * directly retrieven and save any implementation data it needs
+ * directly retrieved and save any implementation data it needs
* to in association with the snapshot.
*/
struct cond_snapshot {
@@ -253,6 +272,17 @@ struct cond_snapshot {
};
/*
+ * struct trace_func_repeats - used to keep track of the consecutive
+ * (on the same CPU) calls of a single function.
+ */
+struct trace_func_repeats {
+ unsigned long ip;
+ unsigned long parent_ip;
+ unsigned long count;
+ u64 ts_last_call;
+};
+
+/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
@@ -276,7 +306,8 @@ struct trace_array {
struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -285,6 +316,7 @@ struct trace_array {
#endif
#endif
struct trace_pid_list __rcu *filtered_pids;
+ struct trace_pid_list __rcu *filtered_no_pids;
/*
* max_lock is used to protect the swapping of buffers
* when taking a max snapshot. The buffers themselves are
@@ -328,9 +360,11 @@ struct trace_array {
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
+ int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
+ struct trace_pid_list __rcu *function_no_pids;
#ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_head func_probes;
@@ -340,11 +374,12 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
- int time_stamp_abs_ref;
+ int no_filter_buffering_ref;
struct list_head hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
struct cond_snapshot *cond_snapshot;
#endif
+ struct trace_func_repeats __percpu *last_func_repeats;
};
enum {
@@ -360,7 +395,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
-extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
+extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
@@ -419,6 +455,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
+ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\
+ IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\
IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@@ -429,6 +467,8 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
+ IF_ASSIGN(var, ent, struct func_repeats_entry, \
+ TRACE_FUNC_REPEATS); \
__ftrace_bad_type(); \
} while (0)
@@ -518,7 +558,6 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
- int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -528,168 +567,6 @@ struct tracer {
bool noboot;
};
-
-/* Only current can touch trace_recursion */
-
-/*
- * For function tracing recursion:
- * The order of these bits are important.
- *
- * When function tracing occurs, the following steps are made:
- * If arch does not support a ftrace feature:
- * call internal function (uses INTERNAL bits) which calls...
- * If callback is registered to the "global" list, the list
- * function is called and recursion checks the GLOBAL bits.
- * then this function calls...
- * The function callback, which can use the FTRACE bits to
- * check for recursion.
- *
- * Now if the arch does not suppport a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are protecting against will
- * go through the same steps again.
- *
- * To prevent the multiple recursion checks, if a recursion
- * bit is set that is higher than the MAX bit of the current
- * check, then we know that the check was made by the previous
- * caller, and we can skip the current check.
- */
-enum {
- TRACE_BUFFER_BIT,
- TRACE_BUFFER_NMI_BIT,
- TRACE_BUFFER_IRQ_BIT,
- TRACE_BUFFER_SIRQ_BIT,
-
- /* Start of function recursion bits */
- TRACE_FTRACE_BIT,
- TRACE_FTRACE_NMI_BIT,
- TRACE_FTRACE_IRQ_BIT,
- TRACE_FTRACE_SIRQ_BIT,
-
- /* INTERNAL_BITs must be greater than FTRACE_BITs */
- TRACE_INTERNAL_BIT,
- TRACE_INTERNAL_NMI_BIT,
- TRACE_INTERNAL_IRQ_BIT,
- TRACE_INTERNAL_SIRQ_BIT,
-
- TRACE_BRANCH_BIT,
-/*
- * Abuse of the trace_recursion.
- * As we need a way to maintain state if we are tracing the function
- * graph in irq because we want to trace a particular function that
- * was called in irq context but we have irq tracing off. Since this
- * can only be modified by current, we can reuse trace_recursion.
- */
- TRACE_IRQ_BIT,
-
- /* Set if the function is in the set_graph_function file */
- TRACE_GRAPH_BIT,
-
- /*
- * In the very unlikely case that an interrupt came in
- * at a start of graph tracing, and we want to trace
- * the function in that interrupt, the depth can be greater
- * than zero, because of the preempted start of a previous
- * trace. In an even more unlikely case, depth could be 2
- * if a softirq interrupted the start of graph tracing,
- * followed by an interrupt preempting a start of graph
- * tracing in the softirq, and depth can even be 3
- * if an NMI came in at the start of an interrupt function
- * that preempted a softirq start of a function that
- * preempted normal context!!!! Luckily, it can't be
- * greater than 3, so the next two bits are a mask
- * of what the depth is when we set TRACE_GRAPH_BIT
- */
-
- TRACE_GRAPH_DEPTH_START_BIT,
- TRACE_GRAPH_DEPTH_END_BIT,
-
- /*
- * To implement set_graph_notrace, if this bit is set, we ignore
- * function graph tracing of called functions, until the return
- * function is called to clear it.
- */
- TRACE_GRAPH_NOTRACE_BIT,
-};
-
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
-
-#define trace_recursion_depth() \
- (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
-#define trace_recursion_set_depth(depth) \
- do { \
- current->trace_recursion &= \
- ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \
- current->trace_recursion |= \
- ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \
- } while (0)
-
-#define TRACE_CONTEXT_BITS 4
-
-#define TRACE_FTRACE_START TRACE_FTRACE_BIT
-#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_LIST_START TRACE_INTERNAL_BIT
-#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
-
-#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
-
-static __always_inline int trace_get_context_bit(void)
-{
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = 0;
-
- else if (in_irq())
- bit = 1;
- else
- bit = 2;
- } else
- bit = 3;
-
- return bit;
-}
-
-static __always_inline int trace_test_and_set_recursion(int start, int max)
-{
- unsigned int val = current->trace_recursion;
- int bit;
-
- /* A previous recursion check was made */
- if ((val & TRACE_CONTEXT_MASK) > max)
- return 0;
-
- bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit)))
- return -1;
-
- val |= 1 << bit;
- current->trace_recursion = val;
- barrier();
-
- return bit;
-}
-
-static __always_inline void trace_clear_recursion(int bit)
-{
- unsigned int val = current->trace_recursion;
-
- if (!bit)
- return;
-
- bit = 1 << bit;
- val &= ~bit;
-
- barrier();
- current->trace_recursion = val;
-}
-
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
@@ -713,7 +590,7 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
-struct dentry *tracing_init_dentry(void);
+int tracing_init_dentry(void);
struct ring_buffer_event;
@@ -721,8 +598,7 @@ struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
unsigned long len,
- unsigned long flags,
- int pc);
+ unsigned int trace_ctx);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -733,6 +609,11 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
+bool trace_is_tracepoint_string(const char *str);
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap);
+
int trace_empty(struct trace_iterator *iter);
void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -747,15 +628,14 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -787,6 +667,7 @@ extern int pid_max;
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list *filtered_no_pids,
struct task_struct *task);
void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
struct task_struct *self,
@@ -806,27 +687,30 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- defined(CONFIG_FSNOTIFY)
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
+#define LATENCY_FS_NOTIFY
+#endif
+#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
-
#else
-
static inline void latency_fsnotify(struct trace_array *tr) { }
-
#endif
#ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
- int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
#else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
- int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+ int skip)
{
}
#endif /* CONFIG_STACKTRACE */
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx);
+
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
@@ -850,6 +734,8 @@ extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
#ifdef CONFIG_FTRACE_STARTUP_TEST
+extern void __init disable_tracing_selftest(const char *reason);
+
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
@@ -873,6 +759,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
*/
#define __tracer_data __refdata
#else
+static inline void __init disable_tracing_selftest(const char *reason)
+{
+}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data __read_mostly
#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -957,10 +846,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned long flags, int pc);
+ unsigned int trace_ctx);
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1001,7 +890,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
* is set, and called by an interrupt handler, we still
* want to trace it.
*/
- if (in_irq())
+ if (in_hardirq())
trace_recursion_set(TRACE_IRQ_BIT);
else
trace_recursion_clear(TRACE_IRQ_BIT);
@@ -1078,6 +967,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_PID_IGNORE -1
+#define FTRACE_PID_TRACE -2
+
struct ftrace_func_command {
struct list_head list;
char *name;
@@ -1089,12 +982,15 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
+ FTRACE_PID_IGNORE;
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent);
void ftrace_destroy_function_files(struct trace_array *tr);
+int ftrace_allocate_ftrace_ops(struct trace_array *tr);
+void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
@@ -1116,6 +1012,11 @@ ftrace_create_function_files(struct trace_array *tr,
{
return 0;
}
+static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { }
static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
@@ -1307,6 +1208,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(PAUSE_ON_TRACE, "pause-on-trace"), \
+ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1410,15 +1313,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc,
+ unsigned int trcace_ctx,
struct pt_regs *regs);
static inline void trace_buffer_unlock_commit(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- unsigned long flags, int pc)
+ unsigned int trace_ctx)
{
- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1441,7 +1344,7 @@ __trace_event_discard_commit(struct trace_buffer *buffer,
/*
* Helper function for event_trigger_unlock_commit{_regs}().
* If there are event triggers attached to this event that requires
- * filtering against its fields, then they wil be called as the
+ * filtering against its fields, then they will be called as the
* entry already holds the field information of the current event.
*
* It also checks if the event should be discarded or not.
@@ -1461,26 +1364,37 @@ __event_trigger_test_discard(struct trace_event_file *file,
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
- *tt = event_triggers_call(file, entry, event);
+ *tt = event_triggers_call(file, buffer, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
- (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, entry))) {
- __trace_event_discard_commit(buffer, event);
- return true;
- }
+ if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
+ EVENT_FILE_FL_FILTERED |
+ EVENT_FILE_FL_PID_FILTER))))
+ return false;
+
+ if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+ goto discard;
+
+ if (file->flags & EVENT_FILE_FL_FILTERED &&
+ !filter_match_preds(file->filter, entry))
+ goto discard;
+
+ if ((file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(file))
+ goto discard;
return false;
+ discard:
+ __trace_event_discard_commit(buffer, event);
+ return true;
}
/**
* event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
*
* This is a helper function to handle triggers that require data
* from the event itself. It also tests the event against filters and
@@ -1490,45 +1404,12 @@ static inline void
event_trigger_unlock_commit(struct trace_event_file *file,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc)
-{
- enum event_trigger_type tt = ETT_NONE;
-
- if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
-
- if (tt)
- event_triggers_post_call(file, tt);
-}
-
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event,
- void *entry, unsigned long irq_flags, int pc,
- struct pt_regs *regs)
+ void *entry, unsigned int trace_ctx)
{
enum event_trigger_type tt = ETT_NONE;
if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- irq_flags, pc, regs);
+ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
if (tt)
event_triggers_post_call(file, tt);
@@ -1620,6 +1501,7 @@ extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
+extern void __trace_early_add_events(struct trace_array *tr);
extern struct trace_event_file *__find_event_file(struct trace_array *tr,
const char *system,
@@ -1638,6 +1520,7 @@ extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_hist_debug_fops;
extern const struct file_operations event_inject_fops;
#ifdef CONFIG_HIST_TRIGGERS
@@ -1651,9 +1534,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);
+enum {
+ EVENT_TRIGGER_FL_PROBE = BIT(0),
+};
+
struct event_trigger_data {
unsigned long count;
int ref;
+ int flags;
struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
@@ -1758,7 +1646,7 @@ extern int register_trigger_hist_enable_disable_cmds(void);
*/
struct event_trigger_ops {
void (*func)(struct event_trigger_data *data,
- void *rec,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
@@ -1941,10 +1829,9 @@ extern int tracing_set_cpumask(struct trace_array *tr,
#define MAX_EVENT_NAME_LEN 64
-extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
extern ssize_t trace_parse_run_command(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
+ int (*createfn)(const char *));
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
@@ -2050,4 +1937,42 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
iter->pos = -1;
}
+/* Check the name is good for event/group/fields */
+static inline bool is_good_name(const char *name)
+{
+ if (!isalpha(*name) && *name != '_')
+ return false;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return false;
+ }
+ return true;
+}
+
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
+/*
+ * This is a generic way to read and write a u64 value from a file in tracefs.
+ *
+ * The value is stored on the variable pointed by *val. The value needs
+ * to be at least *min and at most *max. The write is protected by an
+ * existing *lock.
+ */
+struct trace_min_max_param {
+ struct mutex *lock;
+ u64 *val;
+ u64 *min;
+ u64 *max;
+};
+
+#define U64_STR_SIZE 24 /* 20 digits max */
+
+extern const struct file_operations trace_min_max_fops;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 2e9a4746ea85..801c2a7f7605 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -31,7 +31,7 @@ static bool ok_to_run;
* it simply writes "START". As the first write is cold cache and
* the rest is hot, we save off that time in bm_first and it is
* reported as "first", which is shown in the second write to the
- * tracepoint. The "first" field is writen within the statics from
+ * tracepoint. The "first" field is written within the statics from
* then on but never changes.
*/
static void trace_do_benchmark(void)
@@ -112,7 +112,7 @@ static void trace_do_benchmark(void)
int i = 0;
/*
* stddev is the square of standard deviation but
- * we want the actualy number. Use the average
+ * we want the actually number. Use the average
* as our seed to find the std.
*
* The next try is:
@@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg)
/*
* We don't go to sleep, but let others run as well.
- * This is bascially a "yield()" to let any task that
+ * This is basically a "yield()" to let any task that
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 06d7feb5255f..0580287d7a0d 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -40,6 +40,16 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
pr_err("Failed to set option: %s\n", buf);
}
+ p = xbc_node_find_value(node, "tracing_on", NULL);
+ if (p && *p != '\0') {
+ if (kstrtoul(p, 10, &v))
+ pr_err("Failed to set tracing on: %s\n", p);
+ if (v)
+ tracer_tracing_on(tr);
+ else
+ tracer_tracing_off(tr);
+ }
+
p = xbc_node_find_value(node, "trace_clock", NULL);
if (p && *p != '\0') {
if (tracing_set_clock(tr, p) < 0)
@@ -95,24 +105,24 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
struct xbc_node *anode;
char buf[MAX_BUF_LEN];
const char *val;
- int ret;
+ int ret = 0;
- kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+ xbc_node_for_each_array_value(node, "probes", anode, val) {
+ kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
- ret = kprobe_event_gen_cmd_start(&cmd, event, NULL);
- if (ret)
- return ret;
+ ret = kprobe_event_gen_cmd_start(&cmd, event, val);
+ if (ret) {
+ pr_err("Failed to generate probe: %s\n", buf);
+ break;
+ }
- xbc_node_for_each_array_value(node, "probes", anode, val) {
- ret = kprobe_event_add_field(&cmd, val);
- if (ret)
- return ret;
+ ret = kprobe_event_gen_cmd_end(&cmd);
+ if (ret) {
+ pr_err("Failed to add probe: %s\n", buf);
+ break;
+ }
}
- ret = kprobe_event_gen_cmd_end(&cmd);
- if (ret)
- pr_err("Failed to add probe: %s\n", buf);
-
return ret;
}
#else
@@ -124,7 +134,7 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
}
#endif
-#ifdef CONFIG_HIST_TRIGGERS
+#ifdef CONFIG_SYNTH_EVENTS
static int __init
trace_boot_add_synth_event(struct xbc_node *node, const char *event)
{
@@ -161,6 +171,293 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event)
}
#endif
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init __printf(3, 4)
+append_printf(char **bufp, char *end, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ if (*bufp == end)
+ return -ENOSPC;
+
+ va_start(args, fmt);
+ ret = vsnprintf(*bufp, end - *bufp, fmt, args);
+ if (ret < end - *bufp) {
+ *bufp += ret;
+ } else {
+ *bufp = end;
+ ret = -ERANGE;
+ }
+ va_end(args);
+
+ return ret;
+}
+
+static int __init
+append_str_nospace(char **bufp, char *end, const char *str)
+{
+ char *p = *bufp;
+ int len;
+
+ while (p < end - 1 && *str != '\0') {
+ if (!isspace(*str))
+ *(p++) = *str;
+ str++;
+ }
+ *p = '\0';
+ if (p == end - 1) {
+ *bufp = end;
+ return -ENOSPC;
+ }
+ len = p - *bufp;
+ *bufp = p;
+ return (int)len;
+}
+
+static int __init
+trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
+ char *end, const char *key)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char sep;
+
+ p = xbc_node_find_value(hnode, key, &anode);
+ if (p) {
+ if (!anode) {
+ pr_err("hist.%s requires value(s).\n", key);
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ":%s", key);
+ sep = '=';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '=')
+ sep = ',';
+ }
+ } else
+ return -ENOENT;
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
+ char *end, const char *handler,
+ const char *param)
+{
+ struct xbc_node *knode, *anode;
+ const char *p;
+ char sep;
+
+ /* Compose 'handler' parameter */
+ p = xbc_node_find_value(hnode, param, NULL);
+ if (!p) {
+ pr_err("hist.%s requires '%s' option.\n",
+ xbc_node_get_data(hnode), param);
+ return -EINVAL;
+ }
+ append_printf(bufp, end, ":%s(%s)", handler, p);
+
+ /* Compose 'action' parameter */
+ knode = xbc_node_find_subkey(hnode, "trace");
+ if (!knode)
+ knode = xbc_node_find_subkey(hnode, "save");
+
+ if (knode) {
+ anode = xbc_node_get_child(knode);
+ if (!anode || !xbc_node_is_value(anode)) {
+ pr_err("hist.%s.%s requires value(s).\n",
+ xbc_node_get_data(hnode),
+ xbc_node_get_data(knode));
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ".%s", xbc_node_get_data(knode));
+ sep = '(';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '(')
+ sep = ',';
+ }
+ append_printf(bufp, end, ")");
+ } else if (xbc_node_find_subkey(hnode, "snapshot")) {
+ append_printf(bufp, end, ".snapshot()");
+ } else {
+ pr_err("hist.%s requires an action.\n",
+ xbc_node_get_data(hnode));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
+ char *end, const char *param)
+{
+ struct xbc_node *node;
+ const char *p, *handler;
+ int ret;
+
+ handler = xbc_node_get_data(hnode);
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param);
+ if (ret < 0)
+ break;
+ }
+
+ if (xbc_node_find_subkey(hnode, param))
+ ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
+
+ return ret;
+}
+
+/*
+ * Histogram boottime tracing syntax.
+ *
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] {
+ * keys = <KEY>[,...]
+ * values = <VAL>[,...]
+ * sort = <SORT-KEY>[,...]
+ * size = <ENTRIES>
+ * name = <HISTNAME>
+ * var { <VAR> = <EXPR> ... }
+ * pause|continue|clear
+ * onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
+ * onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
+ * filter = <FILTER>
+ * }
+ *
+ * Where <ACTION> are;
+ *
+ * trace = <EVENT>, <ARG1>[, ...]
+ * save = <ARG1>[, ...]
+ * snapshot
+ */
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node, *knode;
+ char *end = buf + size;
+ const char *p;
+ int ret = 0;
+
+ append_printf(&buf, end, "hist");
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "keys");
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ pr_err("hist requires keys.\n");
+ return -EINVAL;
+ }
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "values");
+ if (ret == -EINVAL)
+ return ret;
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "sort");
+ if (ret == -EINVAL)
+ return ret;
+
+ p = xbc_node_find_value(hnode, "size", NULL);
+ if (p)
+ append_printf(&buf, end, ":size=%s", p);
+
+ p = xbc_node_find_value(hnode, "name", NULL);
+ if (p)
+ append_printf(&buf, end, ":name=%s", p);
+
+ node = xbc_node_find_subkey(hnode, "var");
+ if (node) {
+ xbc_node_for_each_key_value(node, knode, p) {
+ /* Expression must not include spaces. */
+ append_printf(&buf, end, ":%s=",
+ xbc_node_get_data(knode));
+ append_str_nospace(&buf, end, p);
+ }
+ }
+
+ /* Histogram control attributes (mutual exclusive) */
+ if (xbc_node_find_value(hnode, "pause", NULL))
+ append_printf(&buf, end, ":pause");
+ else if (xbc_node_find_value(hnode, "continue", NULL))
+ append_printf(&buf, end, ":continue");
+ else if (xbc_node_find_value(hnode, "clear", NULL))
+ append_printf(&buf, end, ":clear");
+
+ /* Histogram handler and actions */
+ node = xbc_node_find_subkey(hnode, "onmax");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onchange");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onmatch");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
+ return -EINVAL;
+
+ p = xbc_node_find_value(hnode, "filter", NULL);
+ if (p)
+ append_printf(&buf, end, " if %s", p);
+
+ if (buf == end) {
+ pr_err("hist exceeds the max command length.\n");
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node;
+ const char *p;
+ char *tmp;
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+
+ if (xbc_node_find_subkey(hnode, "keys")) {
+ if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+}
+#else
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ /* do nothing */
+}
+#endif
+
static void __init
trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
struct xbc_node *enode)
@@ -195,12 +492,18 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
pr_err("Failed to apply filter: %s\n", buf);
}
- xbc_node_for_each_array_value(enode, "actions", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
- pr_err("action string is too long: %s\n", p);
- else if (trigger_process_regex(file, buf) < 0)
- pr_err("Failed to apply an action: %s\n", buf);
- }
+ if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", p);
+ }
+ anode = xbc_node_find_subkey(enode, "hist");
+ if (anode)
+ trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
+ } else if (xbc_node_find_value(enode, "actions", NULL))
+ pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
if (xbc_node_find_value(enode, "enable", NULL)) {
if (trace_event_enable_disable(file, 1, 0) < 0)
@@ -215,14 +518,37 @@ static void __init
trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
{
struct xbc_node *gnode, *enode;
+ bool enable, enable_all = false;
+ const char *data;
- node = xbc_node_find_child(node, "event");
+ node = xbc_node_find_subkey(node, "event");
if (!node)
return;
/* per-event key starts with "event.GROUP.EVENT" */
- xbc_node_for_each_child(node, gnode)
- xbc_node_for_each_child(gnode, enode)
+ xbc_node_for_each_subkey(node, gnode) {
+ data = xbc_node_get_data(gnode);
+ if (!strcmp(data, "enable")) {
+ enable_all = true;
+ continue;
+ }
+ enable = false;
+ xbc_node_for_each_subkey(gnode, enode) {
+ data = xbc_node_get_data(enode);
+ if (!strcmp(data, "enable")) {
+ enable = true;
+ continue;
+ }
trace_boot_init_one_event(tr, gnode, enode);
+ }
+ /* Event enablement must be done after event settings */
+ if (enable) {
+ data = xbc_node_get_data(gnode);
+ trace_array_set_clr_event(tr, data, NULL, true);
+ }
+ }
+ /* Ditto */
+ if (enable_all)
+ trace_array_set_clr_event(tr, NULL, NULL, true);
}
#else
#define trace_boot_enable_events(tr, node) do {} while (0)
@@ -274,6 +600,12 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
if (tracing_set_tracer(tr, p) < 0)
pr_err("Failed to set given tracer: %s\n", p);
}
+
+ /* Since tracer can free snapshot buffer, allocate snapshot here.*/
+ if (xbc_node_find_value(node, "alloc_snapshot", NULL)) {
+ if (tracing_alloc_snapshot_instance(tr) < 0)
+ pr_err("Failed to allocate snapshot buffer\n");
+ }
}
static void __init
@@ -292,11 +624,11 @@ trace_boot_init_instances(struct xbc_node *node)
struct trace_array *tr;
const char *p;
- node = xbc_node_find_child(node, "instance");
+ node = xbc_node_find_subkey(node, "instance");
if (!node)
return;
- xbc_node_for_each_child(node, inode) {
+ xbc_node_for_each_subkey(node, inode) {
p = xbc_node_get_data(inode);
if (!p || *p == '\0')
continue;
@@ -328,7 +660,12 @@ static int __init trace_boot_init(void)
trace_boot_init_one_instance(tr, trace_node);
trace_boot_init_instances(trace_node);
+ disable_tracing_selftest("running boot-time tracing");
+
return 0;
}
-
-fs_initcall(trace_boot_init);
+/*
+ * Start tracing at the end of core-initcall, so that it starts tracing
+ * from the beginning of postcore_initcall.
+ */
+core_initcall_sync(trace_boot_init);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
const char *p;
if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
if (atomic_read(&data->disabled))
goto out;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx_flags(flags);
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aaf6793ededa..4702efb00ff2 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
- u64 now;
+ u64 now, prev_time;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = sched_clock_cpu(this_cpu);
+
/*
- * If in an NMI context then dont risk lockups and return the
- * cpu_clock() time:
+ * The global clock "guarantees" that the events are ordered
+ * between CPUs. But if two events on two different CPUS call
+ * trace_clock_global at roughly the same time, it really does
+ * not matter which one gets the earlier time. Just make sure
+ * that the same CPU will always show a monotonic clock.
+ *
+ * Use a read memory barrier to get the latest written
+ * time that was recorded.
*/
- if (unlikely(in_nmi()))
- goto out;
+ smp_rmb();
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ now = sched_clock_cpu(this_cpu);
- arch_spin_lock(&trace_clock_struct.lock);
+ /* Make sure that now is always greater than or equal to prev_time */
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
/*
- * TODO: if this happens often then maybe we should reset
- * my_scd->clock to prev_time+1, to make sure
- * we start ticking with the local clock from now on?
+ * If in an NMI context then dont risk lockups and simply return
+ * the current time.
*/
- if ((s64)(now - trace_clock_struct.prev_time) < 0)
- now = trace_clock_struct.prev_time + 1;
+ if (unlikely(in_nmi()))
+ goto out;
- trace_clock_struct.prev_time = now;
+ /* Tracing can cause strange recursion, always use a try lock */
+ if (arch_spin_trylock(&trace_clock_struct.lock)) {
+ /* Reread prev_time in case it was already updated */
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
- arch_spin_unlock(&trace_clock_struct.lock);
+ trace_clock_struct.prev_time = now;
+ /* The unlock acts as the wmb for the above rmb */
+ arch_spin_unlock(&trace_clock_struct.lock);
+ }
out:
raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 9f2e8520b748..e34e8182ee4b 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -13,11 +13,49 @@
#include <linux/tracefs.h>
#include "trace.h"
+#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
static DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
+bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
+{
+ struct trace_event_call *call;
+ bool ret = false;
+
+ if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call == dyn_call) {
+ atomic_inc(&dyn_call->refcnt);
+ ret = true;
+ }
+ }
+ up_read(&trace_event_sem);
+ return ret;
+}
+
+void trace_event_dyn_put_ref(struct trace_event_call *call)
+{
+ if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return;
+
+ if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) {
+ atomic_set(&call->refcnt, 0);
+ return;
+ }
+
+ atomic_dec(&call->refcnt);
+}
+
+bool trace_event_dyn_busy(struct trace_event_call *call)
+{
+ return atomic_read(&call->refcnt) != 0;
+}
+
int dyn_event_register(struct dyn_event_operations *ops)
{
if (!ops || !ops->create || !ops->show || !ops->is_busy ||
@@ -31,23 +69,31 @@ int dyn_event_register(struct dyn_event_operations *ops)
return 0;
}
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type)
{
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
- int ret = -ENOENT;
+ int argc, ret = -ENOENT;
+ char **argv;
+
+ argv = argv_split(GFP_KERNEL, raw_command, &argc);
+ if (!argv)
+ return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':')
- return -EINVAL;
+ if (argv[0][1] != ':') {
+ ret = -EINVAL;
+ goto out;
+ }
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event)
- return -EINVAL;
+ if (!event) {
+ ret = -EINVAL;
+ goto out;
+ }
event++;
}
- argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -55,15 +101,17 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0')
- return -EINVAL;
+ if (event[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
if (type && type != pos->ops)
continue;
if (!pos->ops->match(system, event,
- argc, (const char **)argv, pos))
+ argc - 1, (const char **)argv + 1, pos))
continue;
ret = pos->ops->free(pos);
@@ -71,21 +119,22 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
break;
}
mutex_unlock(&event_mutex);
-
+out:
+ argv_free(argv);
return ret;
}
-static int create_dyn_event(int argc, char **argv)
+static int create_dyn_event(const char *raw_command)
{
struct dyn_event_operations *ops;
int ret = -ENODEV;
- if (argv[0][0] == '-' || argv[0][0] == '!')
- return dyn_event_release(argc, argv, NULL);
+ if (raw_command[0] == '-' || raw_command[0] == '!')
+ return dyn_event_release(raw_command, NULL);
mutex_lock(&dyn_event_ops_mutex);
list_for_each_entry(ops, &dyn_event_ops_list, list) {
- ret = ops->create(argc, (const char **)argv);
+ ret = ops->create(raw_command);
if (!ret || ret != -ECANCELED)
break;
}
@@ -206,14 +255,14 @@ static const struct file_operations dynamic_events_ops = {
/* Make a tracefs interface for controlling dynamic events */
static __init int init_dynamic_event(void)
{
- struct dentry *d_tracer;
struct dentry *entry;
+ int ret;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
+ ret = tracing_init_dentry();
+ if (ret)
return 0;
- entry = tracefs_create_file("dynamic_events", 0644, d_tracer,
+ entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
NULL, &dynamic_events_ops);
/* Event list interface */
@@ -276,7 +325,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd,
* arguments of the form 'type variable_name;' or 'x+y'.
*
* The lhs argument string will be appended to the current cmd string,
- * followed by an operator, if applicable, followd by the rhs string,
+ * followed by an operator, if applicable, followed by the rhs string,
* followed finally by a separator, if applicable. Before the
* argument is added, the @check_arg function, if present, will be
* used to check the sanity of the current arg strings.
@@ -402,7 +451,7 @@ void dynevent_arg_init(struct dynevent_arg *arg,
* whitespace, all followed by a separator, if applicable. After the
* first arg string is successfully appended to the command string,
* the optional @operator is appended, followed by the second arg and
- * and optional @separator. If no separator was specified when
+ * optional @separator. If no separator was specified when
* initializing the arg, a space will be appended.
*/
void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair,
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index d6857a254ede..936477a111d3 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -29,17 +29,17 @@ struct dyn_event;
* @show: Showing method. This is invoked when user reads the event definitions
* via dynamic_events interface.
* @is_busy: Check whether given event is busy so that it can not be deleted.
- * Return true if it is busy, otherwides false.
- * @free: Delete the given event. Return 0 if success, otherwides error.
+ * Return true if it is busy, otherwise false.
+ * @free: Delete the given event. Return 0 if success, otherwise error.
* @match: Check whether given event and system name match this event. The argc
- * and argv is used for exact match. Return true if it matches, otherwides
+ * and argv is used for exact match. Return true if it matches, otherwise
* false.
*
* Except for @create, these methods are called under holding event_mutex.
*/
struct dyn_event_operations {
struct list_head list;
- int (*create)(int argc, const char *argv[]);
+ int (*create)(const char *raw_command);
int (*show)(struct seq_file *m, struct dyn_event *ev);
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
@@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
return 0;
}
-static inline int dyn_event_add(struct dyn_event *ev)
+static inline int dyn_event_add(struct dyn_event *ev,
+ struct trace_event_call *call)
{
lockdep_assert_held(&event_mutex);
if (!ev || !ev->ops)
return -EINVAL;
+ call->flags |= TRACE_EVENT_FL_DYNAMIC;
list_add_tail(&ev->list, &dyn_event_list);
return 0;
}
@@ -97,7 +99,7 @@ void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
-int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index f22746f3c132..cd41e863b51c 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -32,7 +32,7 @@
* to be deciphered for the format file. Although these macros
* may become out of sync with the internal structure, they
* will create a compile error if it happens. Since the
- * internel structures are just tracing helpers, this is not
+ * internal structures are just tracing helpers, this is not
* an issue.
*
* When an internal structure is used, it should use:
@@ -78,8 +78,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
- __field_desc( unsigned long, graph_ent, func )
- __field_desc( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( int, graph_ent, depth )
),
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
@@ -92,11 +92,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_desc( unsigned long, ret, func )
- __field_desc( unsigned long, ret, overrun )
- __field_desc( unsigned long long, ret, calltime)
- __field_desc( unsigned long long, ret, rettime )
- __field_desc( int, ret, depth )
+ __field_packed( unsigned long, ret, func )
+ __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, overrun )
+ __field_packed( unsigned long long, ret, calltime)
+ __field_packed( unsigned long long, ret, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
@@ -325,14 +325,79 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__field_desc( long, timestamp, tv_nsec )
__field( unsigned int, nmi_count )
__field( unsigned int, seqnum )
+ __field( unsigned int, count )
),
- F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+ F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n",
__entry->seqnum,
__entry->tv_sec,
__entry->tv_nsec,
__entry->duration,
__entry->outer_duration,
+ __entry->count,
__entry->nmi_total_ts,
__entry->nmi_count)
);
+
+#define FUNC_REPEATS_GET_DELTA_TS(entry) \
+ (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \
+
+FTRACE_ENTRY(func_repeats, func_repeats_entry,
+
+ TRACE_FUNC_REPEATS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned long, parent_ip )
+ __field( u16 , count )
+ __field( u16 , top_delta_ts )
+ __field( u32 , bottom_delta_ts )
+ ),
+
+ F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)",
+ (void *)__entry->ip,
+ (void *)__entry->parent_ip,
+ __entry->count,
+ FUNC_REPEATS_GET_DELTA_TS(__entry))
+);
+
+FTRACE_ENTRY(osnoise, osnoise_entry,
+
+ TRACE_OSNOISE,
+
+ F_STRUCT(
+ __field( u64, noise )
+ __field( u64, runtime )
+ __field( u64, max_sample )
+ __field( unsigned int, hw_count )
+ __field( unsigned int, nmi_count )
+ __field( unsigned int, irq_count )
+ __field( unsigned int, softirq_count )
+ __field( unsigned int, thread_count )
+ ),
+
+ F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n",
+ __entry->noise,
+ __entry->max_sample,
+ __entry->hw_count,
+ __entry->nmi_count,
+ __entry->irq_count,
+ __entry->softirq_count,
+ __entry->thread_count)
+);
+
+FTRACE_ENTRY(timerlat, timerlat_entry,
+
+ TRACE_TIMERLAT,
+
+ F_STRUCT(
+ __field( unsigned int, seqnum )
+ __field( int, context )
+ __field( u64, timer_latency )
+ ),
+
+ F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n",
+ __entry->seqnum,
+ __entry->context,
+ __entry->timer_latency)
+);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
new file mode 100644
index 000000000000..928867f527e7
--- /dev/null
+++ b/kernel/trace/trace_eprobe.c
@@ -0,0 +1,959 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event probes
+ *
+ * Part of this code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <mhiramat@kernel.org>
+ *
+ * Copyright (C) 2021, VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
+
+#define EPROBE_EVENT_SYSTEM "eprobes"
+
+struct trace_eprobe {
+ /* tracepoint system */
+ const char *event_system;
+
+ /* tracepoint event */
+ const char *event_name;
+
+ struct trace_event_call *event;
+
+ struct dyn_event devent;
+ struct trace_probe tp;
+};
+
+struct eprobe_data {
+ struct trace_event_file *file;
+ struct trace_eprobe *ep;
+};
+
+static int __trace_eprobe_create(int argc, const char *argv[]);
+
+static void trace_event_probe_cleanup(struct trace_eprobe *ep)
+{
+ if (!ep)
+ return;
+ trace_probe_cleanup(&ep->tp);
+ kfree(ep->event_name);
+ kfree(ep->event_system);
+ if (ep->event)
+ trace_event_put_ref(ep->event);
+ kfree(ep);
+}
+
+static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_eprobe, devent);
+}
+
+static int eprobe_dyn_event_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_eprobe_create);
+}
+
+static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int i;
+
+ seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp),
+ trace_probe_name(&ep->tp));
+ seq_printf(m, " %s.%s", ep->event_system, ep->event_name);
+
+ for (i = 0; i < ep->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int unregister_trace_eprobe(struct trace_eprobe *ep)
+{
+ /* If other probes are on the event, just unregister eprobe */
+ if (trace_probe_has_sibling(&ep->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&ep->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (trace_probe_unregister_event_call(&ep->tp))
+ return -EBUSY;
+
+unreg:
+ dyn_event_remove(&ep->devent);
+ trace_probe_unlink(&ep->tp);
+
+ return 0;
+}
+
+static int eprobe_dyn_event_release(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int ret = unregister_trace_eprobe(ep);
+
+ if (!ret)
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+static bool eprobe_dyn_event_is_busy(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+
+ return trace_probe_is_enabled(&ep->tp);
+}
+
+static bool eprobe_dyn_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ const char *slash;
+
+ /*
+ * We match the following:
+ * event only - match all eprobes with event name
+ * system and event only - match all system/event probes
+ *
+ * The below has the above satisfied with more arguments:
+ *
+ * attached system/event - If the arg has the system and event
+ * the probe is attached to, match
+ * probes with the attachment.
+ *
+ * If any more args are given, then it requires a full match.
+ */
+
+ /*
+ * If system exists, but this probe is not part of that system
+ * do not match.
+ */
+ if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+ return false;
+
+ /* Must match the event name */
+ if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+ return false;
+
+ /* No arguments match all */
+ if (argc < 1)
+ return true;
+
+ /* First argument is the system/event the probe is attached to */
+
+ slash = strchr(argv[0], '/');
+ if (!slash)
+ slash = strchr(argv[0], '.');
+ if (!slash)
+ return false;
+
+ if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+ return false;
+ if (strcmp(ep->event_name, slash + 1))
+ return false;
+
+ argc--;
+ argv++;
+
+ /* If there are no other args, then match */
+ if (argc < 1)
+ return true;
+
+ return trace_probe_match_command_args(&ep->tp, argc, argv);
+}
+
+static struct dyn_event_operations eprobe_dyn_event_ops = {
+ .create = eprobe_dyn_event_create,
+ .show = eprobe_dyn_event_show,
+ .is_busy = eprobe_dyn_event_is_busy,
+ .free = eprobe_dyn_event_release,
+ .match = eprobe_dyn_event_match,
+};
+
+static struct trace_eprobe *alloc_event_probe(const char *group,
+ const char *this_event,
+ struct trace_event_call *event,
+ int nargs)
+{
+ struct trace_eprobe *ep;
+ const char *event_name;
+ const char *sys_name;
+ int ret = -ENOMEM;
+
+ if (!event)
+ return ERR_PTR(-ENODEV);
+
+ sys_name = event->class->system;
+ event_name = trace_event_name(event);
+
+ ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
+ if (!ep) {
+ trace_event_put_ref(event);
+ goto error;
+ }
+ ep->event = event;
+ ep->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ep->event_name)
+ goto error;
+ ep->event_system = kstrdup(sys_name, GFP_KERNEL);
+ if (!ep->event_system)
+ goto error;
+
+ ret = trace_probe_init(&ep->tp, this_event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
+ return ep;
+error:
+ trace_event_probe_cleanup(ep);
+ return ERR_PTR(ret);
+}
+
+static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
+{
+ struct probe_arg *parg = &ep->tp.args[i];
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ head = trace_get_fields(ep->event);
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(parg->code->data, field->name)) {
+ kfree(parg->code->data);
+ parg->code->data = field;
+ return 0;
+ }
+ }
+ kfree(parg->code->data);
+ parg->code->data = NULL;
+ return -ENOENT;
+}
+
+static int eprobe_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct eprobe_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_fields eprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = eprobe_event_define_fields },
+ {}
+};
+
+/* Event entry printers */
+static enum print_line_t
+print_eprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct eprobe_trace_entry_head *field;
+ struct trace_event_call *pevent;
+ struct trace_event *probed_event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct eprobe_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ probed_event = ftrace_find_event(field->type);
+ if (probed_event) {
+ pevent = container_of(probed_event, struct trace_event_call, event);
+ trace_seq_printf(s, "%s.%s", pevent->class->system,
+ trace_event_name(pevent));
+ } else {
+ trace_seq_printf(s, "%u", field->type);
+ }
+
+ trace_seq_putc(s, ')');
+
+ if (print_probe_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static unsigned long get_event_field(struct fetch_insn *code, void *rec)
+{
+ struct ftrace_event_field *field = code->data;
+ unsigned long val;
+ void *addr;
+
+ addr = rec + field->offset;
+
+ switch (field->size) {
+ case 1:
+ if (field->is_signed)
+ val = *(char *)addr;
+ else
+ val = *(unsigned char *)addr;
+ break;
+ case 2:
+ if (field->is_signed)
+ val = *(short *)addr;
+ else
+ val = *(unsigned short *)addr;
+ break;
+ case 4:
+ if (field->is_signed)
+ val = *(int *)addr;
+ else
+ val = *(unsigned int *)addr;
+ break;
+ default:
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ return val;
+}
+
+static int get_eprobe_size(struct trace_probe *tp, void *rec)
+{
+ struct probe_arg *arg;
+ int i, len, ret = 0;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ if (unlikely(arg->dynamic)) {
+ unsigned long val;
+
+ val = get_event_field(arg->code, rec);
+ len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
+ if (len > 0)
+ ret += len;
+ }
+ }
+
+ return ret;
+}
+
+/* Kprobe specific fetch functions */
+
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ unsigned long val;
+
+ val = get_event_field(code, rec);
+ return process_fetch_insn_bottom(code + 1, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ return (ret < 0) ? ret : len;
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
+/* eprobe handler */
+static inline void
+__eprobe_trace_func(struct eprobe_data *edata, void *rec)
+{
+ struct eprobe_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != edata->file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(edata->file))
+ return;
+
+ fbuffer.trace_ctx = tracing_gen_ctx();
+ fbuffer.trace_file = edata->file;
+
+ dsize = get_eprobe_size(&edata->ep->tp, rec);
+ fbuffer.regs = NULL;
+
+ fbuffer.event =
+ trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file,
+ call->event.type,
+ sizeof(*entry) + edata->ep->tp.size + dsize,
+ fbuffer.trace_ctx);
+ if (!fbuffer.event)
+ return;
+
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ if (edata->ep->event)
+ entry->type = edata->ep->event->event.type;
+ else
+ entry->type = 0;
+ store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+/*
+ * The event probe implementation uses event triggers to get access to
+ * the event it is attached to, but is not an actual trigger. The below
+ * functions are just stubs to fulfill what is needed to use the trigger
+ * infrastructure.
+ */
+static int eprobe_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return 0;
+}
+
+static void eprobe_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+
+}
+
+static int eprobe_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ /* Do not print eprobe event triggers */
+ return 0;
+}
+
+static void eprobe_trigger_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *rbe)
+{
+ struct eprobe_data *edata = data->private_data;
+
+ __eprobe_trace_func(edata, rec);
+}
+
+static struct event_trigger_ops eprobe_trigger_ops = {
+ .func = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
+};
+
+static int eprobe_trigger_cmd_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ return -1;
+}
+
+static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ return -1;
+}
+
+static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+
+}
+
+static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
+{
+ return &eprobe_trigger_ops;
+}
+
+static struct event_command event_trigger_cmd = {
+ .name = "eprobe",
+ .trigger_type = ETT_EVENT_EPROBE,
+ .flags = EVENT_CMD_FL_NEEDS_REC,
+ .func = eprobe_trigger_cmd_func,
+ .reg = eprobe_trigger_reg_func,
+ .unreg = eprobe_trigger_unreg_func,
+ .unreg_all = NULL,
+ .get_trigger_ops = eprobe_trigger_get_ops,
+ .set_filter = NULL,
+};
+
+static struct event_trigger_data *
+new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
+{
+ struct event_trigger_data *trigger;
+ struct eprobe_data *edata;
+
+ edata = kzalloc(sizeof(*edata), GFP_KERNEL);
+ trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
+ if (!trigger || !edata) {
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trigger->flags = EVENT_TRIGGER_FL_PROBE;
+ trigger->count = -1;
+ trigger->ops = &eprobe_trigger_ops;
+
+ /*
+ * EVENT PROBE triggers are not registered as commands with
+ * register_event_command(), as they are not controlled by the user
+ * from the trigger file
+ */
+ trigger->cmd_ops = &event_trigger_cmd;
+
+ INIT_LIST_HEAD(&trigger->list);
+ RCU_INIT_POINTER(trigger->filter, NULL);
+
+ edata->file = file;
+ edata->ep = ep;
+ trigger->private_data = edata;
+
+ return trigger;
+}
+
+static int enable_eprobe(struct trace_eprobe *ep,
+ struct trace_event_file *eprobe_file)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct trace_array *tr = eprobe_file->tr;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+ trigger = new_eprobe_trigger(ep, eprobe_file);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+
+ list_add_tail_rcu(&trigger->list, &file->triggers);
+
+ trace_event_trigger_enable_disable(file, 1);
+ update_cond_flag(file);
+
+ return 0;
+}
+
+static struct trace_event_functions eprobe_funcs = {
+ .trace = print_eprobe_event
+};
+
+static int disable_eprobe(struct trace_eprobe *ep,
+ struct trace_array *tr)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct eprobe_data *edata;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+
+ list_for_each_entry(trigger, &file->triggers, list) {
+ if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+ continue;
+ edata = trigger->private_data;
+ if (edata->ep == ep)
+ break;
+ }
+ if (list_entry_is_head(trigger, &file->triggers, list))
+ return -ENODEV;
+
+ list_del_rcu(&trigger->list);
+
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+
+ /* Make sure nothing is using the edata or trigger */
+ tracepoint_synchronize_unregister();
+
+ kfree(edata);
+ kfree(trigger);
+
+ return 0;
+}
+
+static int enable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (enabled)
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ ret = enable_eprobe(ep, file);
+ if (ret)
+ break;
+ enabled = true;
+ }
+
+ if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled)
+ disable_eprobe(ep, file->tr);
+ if (file)
+ trace_probe_remove_file(tp, file);
+ else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ }
+
+ return ret;
+}
+
+static int disable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ disable_eprobe(ep, file->tr);
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+static int eprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_eprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_eprobe(event, file);
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static inline void init_trace_eprobe_call(struct trace_eprobe *ep)
+{
+ struct trace_event_call *call = trace_probe_event_call(&ep->tp);
+
+ call->flags = TRACE_EVENT_FL_EPROBE;
+ call->event.funcs = &eprobe_funcs;
+ call->class->fields_array = eprobe_fields_array;
+ call->class->reg = eprobe_register;
+}
+
+static struct trace_event_call *
+find_and_get_event(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ /* Skip other probes and ftrace events */
+ if (tp_event->flags &
+ (TRACE_EVENT_FL_IGNORE_ENABLE |
+ TRACE_EVENT_FL_KPROBE |
+ TRACE_EVENT_FL_UPROBE |
+ TRACE_EVENT_FL_EPROBE))
+ continue;
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ if (!trace_event_try_get_ref(tp_event)) {
+ return NULL;
+ break;
+ }
+ return tp_event;
+ break;
+ }
+ return NULL;
+}
+
+static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
+{
+ unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
+ int ret;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags);
+ if (ret)
+ return ret;
+
+ if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG)
+ ret = trace_eprobe_tp_arg_update(ep, i);
+
+ return ret;
+}
+
+static int __trace_eprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS]
+ * Fetch args:
+ * <name>=$<field>[:TYPE]
+ */
+ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
+ const char *sys_event = NULL, *sys_name = NULL;
+ struct trace_event_call *event_call;
+ struct trace_eprobe *ep = NULL;
+ char buf1[MAX_EVENT_NAME_LEN];
+ char buf2[MAX_EVENT_NAME_LEN];
+ int ret = 0;
+ int i;
+
+ if (argc < 2 || argv[0][0] != 'e')
+ return -ECANCELED;
+
+ trace_probe_log_init("event_probe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event) {
+ event++;
+ ret = traceprobe_parse_event_name(&event, &group, buf1,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ } else {
+ strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
+ sanitize_event_name(buf1);
+ event = buf1;
+ }
+ if (!is_good_name(event) || !is_good_name(group))
+ goto parse_error;
+
+ sys_event = argv[1];
+ ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2,
+ sys_event - argv[1]);
+ if (ret || !sys_name)
+ goto parse_error;
+ if (!is_good_name(sys_event) || !is_good_name(sys_name))
+ goto parse_error;
+
+ mutex_lock(&event_mutex);
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ mutex_unlock(&event_mutex);
+
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ /* This must return -ENOMEM or missing event, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
+ ep = NULL;
+ goto error;
+ }
+
+ argc -= 2; argv += 2;
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ret = trace_eprobe_tp_update_arg(ep, argv, i);
+ if (ret)
+ goto error;
+ }
+ ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
+ if (ret < 0)
+ goto error;
+ init_trace_eprobe_call(ep);
+ mutex_lock(&event_mutex);
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ mutex_unlock(&event_mutex);
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ mutex_unlock(&event_mutex);
+ return ret;
+parse_error:
+ ret = -EINVAL;
+error:
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup eprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int trace_events_eprobe_init_early(void)
+{
+ int err = 0;
+
+ err = dyn_event_register(&eprobe_dyn_event_ops);
+ if (err)
+ pr_warn("Could not register eprobe_dyn_event_ops\n");
+
+ return err;
+}
+core_initcall(trace_events_eprobe_init_early);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 643e0b19920d..a114549720d6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
/*
* Force it to be aligned to unsigned long to avoid misaligned accesses
- * suprises
+ * surprises
*/
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
perf_trace_t;
@@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
}
}
out:
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event)
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
- try_module_get(tp_event->mod)) {
+ trace_event_try_get_ref(tp_event)) {
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
break;
}
}
@@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough, wanted %d, have %d",
+ size, PERF_MAX_TRACE_SIZE))
return NULL;
*rctxp = rctx = perf_swevent_get_recursion_context();
@@ -421,28 +422,33 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
void perf_trace_buf_update(void *record, u16 type)
{
struct trace_entry *entry = record;
- int pc = preempt_count();
- unsigned long flags;
- local_save_flags(flags);
- tracing_generic_entry_update(entry, type, flags, pc);
+ tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);
#ifdef CONFIG_FUNCTION_TRACER
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *pt_regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct ftrace_entry *entry;
struct perf_event *event;
struct hlist_head head;
struct pt_regs regs;
int rctx;
+ int bit;
- if ((unsigned long)ops->private != smp_processor_id())
+ if (!rcu_is_watching())
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
return;
+ if ((unsigned long)ops->private != smp_processor_id())
+ goto out;
+
event = container_of(ops, struct perf_event, ftrace_ops);
/*
@@ -463,13 +469,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
if (!entry)
- return;
+ goto out;
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
1, &regs, &head, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}
@@ -477,7 +485,6 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags = FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
ops->private = (void *)(unsigned long)nr_cpu_ids;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f38234ecea18..92be9cb1d7d4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -38,6 +38,7 @@ DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
+static bool eventdir_initialized;
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -180,6 +181,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned short, type);
__common_field(unsigned char, flags);
+ /* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
@@ -216,6 +218,214 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field = call->class->fields_array;
+ const char *array_descriptor;
+ const char *p = fmt;
+ int len;
+
+ if (!(len = str_has_prefix(fmt, "REC->")))
+ return false;
+ fmt += len;
+ for (p = fmt; *p; p++) {
+ if (!isalnum(*p) && *p != '_')
+ break;
+ }
+ len = p - fmt;
+
+ for (; field->type; field++) {
+ if (strncmp(field->name, fmt, len) ||
+ field->name[len])
+ continue;
+ array_descriptor = strchr(field->type, '[');
+ /* This is an array and is OK to dereference. */
+ return array_descriptor != NULL;
+ }
+ return false;
+}
+
+/*
+ * Examine the print fmt of the event looking for unsafe dereference
+ * pointers using %p* that could be recorded in the trace event and
+ * much later referenced after the pointer was freed. Dereferencing
+ * pointers are OK, if it is dereferenced into the event itself.
+ */
+static void test_event_printk(struct trace_event_call *call)
+{
+ u64 dereference_flags = 0;
+ bool first = true;
+ const char *fmt, *c, *r, *a;
+ int parens = 0;
+ char in_quote = 0;
+ int start_arg = 0;
+ int arg = 0;
+ int i;
+
+ fmt = call->print_fmt;
+
+ if (!fmt)
+ return;
+
+ for (i = 0; fmt[i]; i++) {
+ switch (fmt[i]) {
+ case '\\':
+ i++;
+ if (!fmt[i])
+ return;
+ continue;
+ case '"':
+ case '\'':
+ /*
+ * The print fmt starts with a string that
+ * is processed first to find %p* usage,
+ * then after the first string, the print fmt
+ * contains arguments that are used to check
+ * if the dereferenced %p* usage is safe.
+ */
+ if (first) {
+ if (fmt[i] == '\'')
+ continue;
+ if (in_quote) {
+ arg = 0;
+ first = false;
+ /*
+ * If there was no %p* uses
+ * the fmt is OK.
+ */
+ if (!dereference_flags)
+ return;
+ }
+ }
+ if (in_quote) {
+ if (in_quote == fmt[i])
+ in_quote = 0;
+ } else {
+ in_quote = fmt[i];
+ }
+ continue;
+ case '%':
+ if (!first || !in_quote)
+ continue;
+ i++;
+ if (!fmt[i])
+ return;
+ switch (fmt[i]) {
+ case '%':
+ continue;
+ case 'p':
+ /* Find dereferencing fields */
+ switch (fmt[i + 1]) {
+ case 'B': case 'R': case 'r':
+ case 'b': case 'M': case 'm':
+ case 'I': case 'i': case 'E':
+ case 'U': case 'V': case 'N':
+ case 'a': case 'd': case 'D':
+ case 'g': case 't': case 'C':
+ case 'O': case 'f':
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ }
+ break;
+ default:
+ {
+ bool star = false;
+ int j;
+
+ /* Increment arg if %*s exists. */
+ for (j = 0; fmt[i + j]; j++) {
+ if (isdigit(fmt[i + j]) ||
+ fmt[i + j] == '.')
+ continue;
+ if (fmt[i + j] == '*') {
+ star = true;
+ continue;
+ }
+ if ((fmt[i + j] == 's') && star)
+ arg++;
+ break;
+ }
+ break;
+ } /* default */
+
+ } /* switch */
+ arg++;
+ continue;
+ case '(':
+ if (in_quote)
+ continue;
+ parens++;
+ continue;
+ case ')':
+ if (in_quote)
+ continue;
+ parens--;
+ if (WARN_ONCE(parens < 0,
+ "Paren mismatch for event: %s\narg='%s'\n%*s",
+ trace_event_name(call),
+ fmt + start_arg,
+ (i - start_arg) + 5, "^"))
+ return;
+ continue;
+ case ',':
+ if (in_quote || parens)
+ continue;
+ i++;
+ while (isspace(fmt[i]))
+ i++;
+ start_arg = i;
+ if (!(dereference_flags & (1ULL << arg)))
+ goto next_arg;
+
+ /* Find the REC-> in the argument */
+ c = strchr(fmt + i, ',');
+ r = strstr(fmt + i, "REC->");
+ if (r && (!c || r < c)) {
+ /*
+ * Addresses of events on the buffer,
+ * or an array on the buffer is
+ * OK to dereference.
+ * There's ways to fool this, but
+ * this is to catch common mistakes,
+ * not malicious code.
+ */
+ a = strchr(fmt + i, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+ next_arg:
+ i--;
+ arg++;
+ }
+ }
+
+ /*
+ * If you triggered the below warning, the trace event reported
+ * uses an unsafe dereference pointer %p*. As the data stored
+ * at the trace event time may no longer exist when the trace
+ * event is printed, dereferencing to the original source is
+ * unsafe. The source of the dereference must be copied into the
+ * event itself, and the dereference must access the copy instead.
+ */
+ if (WARN_ON_ONCE(dereference_flags)) {
+ arg = 1;
+ while (!(dereference_flags & 1)) {
+ dereference_flags >>= 1;
+ arg++;
+ }
+ pr_warn("event %s has unsafe dereference of argument %d\n",
+ trace_event_name(call), arg);
+ pr_warn("print_fmt: %s\n", fmt);
+ }
+}
+
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
@@ -224,6 +434,8 @@ int trace_event_raw_init(struct trace_event_call *call)
if (!id)
return -ENODEV;
+ test_event_printk(call);
+
return 0;
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
@@ -232,10 +444,13 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
{
struct trace_array *tr = trace_file->tr;
struct trace_array_cpu *data;
+ struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
pid_list = rcu_dereference_raw(tr->filtered_pids);
- if (!pid_list)
+ no_pid_list = rcu_dereference_raw(tr->filtered_no_pids);
+
+ if (!pid_list && !no_pid_list)
return false;
data = this_cpu_ptr(tr->array_buffer.data);
@@ -254,22 +469,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
trace_event_ignore_this_pid(trace_file))
return NULL;
- local_save_flags(fbuffer->flags);
- fbuffer->pc = preempt_count();
/*
* If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
* preemption (adding one to the preempt_count). Since we are
* interested in the preempt_count at the time the tracepoint was
* hit, we need to subtract one to offset the increment.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- fbuffer->pc--;
+ fbuffer->trace_ctx = tracing_gen_ctx_dec();
fbuffer->trace_file = trace_file;
fbuffer->event =
trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->trace_ctx);
if (!fbuffer->event)
return NULL;
@@ -510,6 +722,9 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
pid_list = rcu_dereference_raw(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
+
+ pid_list = rcu_dereference_raw(tr->filtered_no_pids);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
static void
@@ -522,6 +737,9 @@ event_filter_pid_sched_process_fork(void *data,
pid_list = rcu_dereference_sched(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, self, task);
+
+ pid_list = rcu_dereference_sched(tr->filtered_no_pids);
+ trace_filter_add_remove_task(pid_list, self, task);
}
void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -529,12 +747,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
@@ -544,13 +762,23 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array *tr = data;
+ struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
+ bool ret;
pid_list = rcu_dereference_sched(tr->filtered_pids);
+ no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
- this_cpu_write(tr->array_buffer.data->ignore_pid,
- trace_ignore_this_task(pid_list, prev) &&
- trace_ignore_this_task(pid_list, next));
+ /*
+ * Sched switch is funny, as we only want to ignore it
+ * in the notrace case if both prev and next should be ignored.
+ */
+ ret = trace_ignore_this_task(NULL, no_pid_list, prev) &&
+ trace_ignore_this_task(NULL, no_pid_list, next);
+
+ this_cpu_write(tr->array_buffer.data->ignore_pid, ret ||
+ (trace_ignore_this_task(pid_list, NULL, prev) &&
+ trace_ignore_this_task(pid_list, NULL, next)));
}
static void
@@ -558,18 +786,21 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array *tr = data;
+ struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
pid_list = rcu_dereference_sched(tr->filtered_pids);
+ no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
this_cpu_write(tr->array_buffer.data->ignore_pid,
- trace_ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, no_pid_list, next));
}
static void
event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
{
struct trace_array *tr = data;
+ struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
/* Nothing to do if we are already tracing */
@@ -577,15 +808,17 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
+ no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
this_cpu_write(tr->array_buffer.data->ignore_pid,
- trace_ignore_this_task(pid_list, task));
+ trace_ignore_this_task(pid_list, no_pid_list, task));
}
static void
event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
{
struct trace_array *tr = data;
+ struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
/* Nothing to do if we are not tracing */
@@ -593,23 +826,15 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
return;
pid_list = rcu_dereference_sched(tr->filtered_pids);
+ no_pid_list = rcu_dereference_sched(tr->filtered_no_pids);
/* Set tracing if current is enabled */
this_cpu_write(tr->array_buffer.data->ignore_pid,
- trace_ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, no_pid_list, current));
}
-static void __ftrace_clear_event_pids(struct trace_array *tr)
+static void unregister_pid_events(struct trace_array *tr)
{
- struct trace_pid_list *pid_list;
- struct trace_event_file *file;
- int cpu;
-
- pid_list = rcu_dereference_protected(tr->filtered_pids,
- lockdep_is_held(&event_mutex));
- if (!pid_list)
- return;
-
unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
@@ -621,26 +846,55 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr);
unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr);
+}
- list_for_each_entry(file, &tr->events, list) {
- clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+static void __ftrace_clear_event_pids(struct trace_array *tr, int type)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_pid_list *no_pid_list;
+ struct trace_event_file *file;
+ int cpu;
+
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+
+ /* Make sure there's something to do */
+ if (!pid_type_enabled(type, pid_list, no_pid_list))
+ return;
+
+ if (!still_need_pid_events(type, pid_list, no_pid_list)) {
+ unregister_pid_events(tr);
+
+ list_for_each_entry(file, &tr->events, list) {
+ clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
}
- for_each_possible_cpu(cpu)
- per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false;
+ if (type & TRACE_PIDS)
+ rcu_assign_pointer(tr->filtered_pids, NULL);
- rcu_assign_pointer(tr->filtered_pids, NULL);
+ if (type & TRACE_NO_PIDS)
+ rcu_assign_pointer(tr->filtered_no_pids, NULL);
/* Wait till all users are no longer using pid filtering */
tracepoint_synchronize_unregister();
- trace_free_pid_list(pid_list);
+ if ((type & TRACE_PIDS) && pid_list)
+ trace_pid_list_free(pid_list);
+
+ if ((type & TRACE_NO_PIDS) && no_pid_list)
+ trace_pid_list_free(no_pid_list);
}
-static void ftrace_clear_event_pids(struct trace_array *tr)
+static void ftrace_clear_event_pids(struct trace_array *tr, int type)
{
mutex_lock(&event_mutex);
- __ftrace_clear_event_pids(tr);
+ __ftrace_clear_event_pids(tr, type);
mutex_unlock(&event_mutex);
}
@@ -1013,15 +1267,32 @@ static void t_stop(struct seq_file *m, void *p)
}
static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
+__next(struct seq_file *m, void *v, loff_t *pos, int type)
{
struct trace_array *tr = m->private;
- struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+ struct trace_pid_list *pid_list;
+
+ if (type == TRACE_PIDS)
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ else
+ pid_list = rcu_dereference_sched(tr->filtered_no_pids);
return trace_pid_next(pid_list, v, pos);
}
-static void *p_start(struct seq_file *m, loff_t *pos)
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return __next(m, v, pos, TRACE_PIDS);
+}
+
+static void *
+np_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return __next(m, v, pos, TRACE_NO_PIDS);
+}
+
+static void *__start(struct seq_file *m, loff_t *pos, int type)
__acquires(RCU)
{
struct trace_pid_list *pid_list;
@@ -1036,7 +1307,10 @@ static void *p_start(struct seq_file *m, loff_t *pos)
mutex_lock(&event_mutex);
rcu_read_lock_sched();
- pid_list = rcu_dereference_sched(tr->filtered_pids);
+ if (type == TRACE_PIDS)
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ else
+ pid_list = rcu_dereference_sched(tr->filtered_no_pids);
if (!pid_list)
return NULL;
@@ -1044,6 +1318,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
return trace_pid_start(pid_list, pos);
}
+static void *p_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ return __start(m, pos, TRACE_PIDS);
+}
+
+static void *np_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ return __start(m, pos, TRACE_NO_PIDS);
+}
+
static void p_stop(struct seq_file *m, void *p)
__releases(RCU)
{
@@ -1134,7 +1420,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!trace_event_name(call) || !call->class || !call->class->reg)
+ if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ !trace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -1588,6 +1875,7 @@ static void ignore_task_cpu(void *data)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
+ struct trace_pid_list *no_pid_list;
/*
* This function is called by on_each_cpu() while the
@@ -1595,18 +1883,50 @@ static void ignore_task_cpu(void *data)
*/
pid_list = rcu_dereference_protected(tr->filtered_pids,
mutex_is_locked(&event_mutex));
+ no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+ mutex_is_locked(&event_mutex));
this_cpu_write(tr->array_buffer.data->ignore_pid,
- trace_ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, no_pid_list, current));
+}
+
+static void register_pid_events(struct trace_array *tr)
+{
+ /*
+ * Register a probe that is called before all other probes
+ * to set ignore_pid if next or prev do not match.
+ * Register a probe this is called after all other probes
+ * to only keep ignore_pid set if next pid matches.
+ */
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
}
static ssize_t
-ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+event_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, int type)
{
struct seq_file *m = filp->private_data;
struct trace_array *tr = m->private;
struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *other_pids = NULL;
struct trace_pid_list *pid_list;
struct trace_event_file *file;
ssize_t ret;
@@ -1620,14 +1940,26 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
mutex_lock(&event_mutex);
- filtered_pids = rcu_dereference_protected(tr->filtered_pids,
- lockdep_is_held(&event_mutex));
+ if (type == TRACE_PIDS) {
+ filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ other_pids = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+ } else {
+ filtered_pids = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+ other_pids = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ }
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
goto out;
- rcu_assign_pointer(tr->filtered_pids, pid_list);
+ if (type == TRACE_PIDS)
+ rcu_assign_pointer(tr->filtered_pids, pid_list);
+ else
+ rcu_assign_pointer(tr->filtered_no_pids, pid_list);
list_for_each_entry(file, &tr->events, list) {
set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
@@ -1635,33 +1967,9 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
tracepoint_synchronize_unregister();
- trace_free_pid_list(filtered_pids);
- } else if (pid_list) {
- /*
- * Register a probe that is called before all other probes
- * to set ignore_pid if next or prev do not match.
- * Register a probe this is called after all other probes
- * to only keep ignore_pid set if next pid matches.
- */
- register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
- tr, INT_MAX);
- register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
- tr, 0);
-
- register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
- tr, INT_MAX);
- register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
- tr, 0);
-
- register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre,
- tr, INT_MAX);
- register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post,
- tr, 0);
-
- register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre,
- tr, INT_MAX);
- register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post,
- tr, 0);
+ trace_pid_list_free(filtered_pids);
+ } else if (pid_list && !other_pids) {
+ register_pid_events(tr);
}
/*
@@ -1680,9 +1988,24 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
return ret;
}
+static ssize_t
+ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS);
+}
+
+static ssize_t
+ftrace_event_npid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS);
+}
+
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_npid_open(struct inode *inode, struct file *file);
static int ftrace_event_release(struct inode *inode, struct file *file);
static const struct seq_operations show_event_seq_ops = {
@@ -1706,6 +2029,13 @@ static const struct seq_operations show_set_pid_seq_ops = {
.stop = p_stop,
};
+static const struct seq_operations show_set_no_pid_seq_ops = {
+ .start = np_start,
+ .next = np_next,
+ .show = trace_pid_show,
+ .stop = p_stop,
+};
+
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_event_avail_open,
.read = seq_read,
@@ -1729,6 +2059,14 @@ static const struct file_operations ftrace_set_event_pid_fops = {
.release = ftrace_event_release,
};
+static const struct file_operations ftrace_set_event_notrace_pid_fops = {
+ .open = ftrace_event_set_npid_open,
+ .read = seq_read,
+ .write = ftrace_event_npid_write,
+ .llseek = seq_lseek,
+ .release = ftrace_event_release,
+};
+
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
@@ -1858,7 +2196,28 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_clear_event_pids(tr);
+ ftrace_clear_event_pids(tr, TRACE_PIDS);
+
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
+}
+
+static int
+ftrace_event_set_npid_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_event_pids(tr, TRACE_NO_PIDS);
ret = ftrace_event_open(inode, file, seq_ops);
if (ret < 0)
@@ -1950,16 +2309,21 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = tracefs_create_file("filter", 0644, dir->entry, dir,
- &ftrace_subsystem_filter_fops);
- if (!entry) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
+ /* the ftrace system is special, do not create enable or filter files */
+ if (strcmp(name, "ftrace") != 0) {
- trace_create_file("enable", 0644, dir->entry, dir,
- &ftrace_system_enable_fops);
+ entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
+ dir->entry, dir,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
+ }
+
+ trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
+ &ftrace_system_enable_fops);
+ }
list_add(&dir->list, &tr->systems);
@@ -1975,11 +2339,47 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
static int
+event_define_fields(struct trace_event_call *call)
+{
+ struct list_head *head;
+ int ret = 0;
+
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ struct trace_event_fields *field = call->class->fields_array;
+ unsigned int offset = sizeof(struct trace_entry);
+
+ for (; field->type; field++) {
+ if (field->type == TRACE_FUNCTION_TYPE) {
+ field->define_fields(call);
+ break;
+ }
+
+ offset = ALIGN(offset, field->align);
+ ret = trace_define_field(call, field->type, field->name,
+ offset, field->size,
+ field->is_signed, field->filter_type);
+ if (WARN_ON_ONCE(ret)) {
+ pr_err("error code is %d\n", ret);
+ break;
+ }
+
+ offset += field->size;
+ }
+ }
+
+ return ret;
+}
+
+static int
event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
- struct list_head *head;
struct dentry *d_events;
const char *name;
int ret;
@@ -2003,45 +2403,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
}
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", 0644, file->dir, file,
+ trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
&ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir,
+ trace_create_file("id", TRACE_MODE_READ, file->dir,
(void *)(long)call->event.type,
&ftrace_event_id_fops);
#endif
- /*
- * Other events may have the same class. Only update
- * the fields if they are not already defined.
- */
- head = trace_get_fields(call);
- if (list_empty(head)) {
- struct trace_event_fields *field = call->class->fields_array;
- unsigned int offset = sizeof(struct trace_entry);
-
- for (; field->type; field++) {
- if (field->type == TRACE_FUNCTION_TYPE) {
- ret = field->define_fields(call);
- break;
- }
-
- offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
- offset, field->size,
- field->is_signed, field->filter_type);
- if (ret)
- break;
-
- offset += field->size;
- }
- if (ret < 0) {
- pr_warn("Could not initialize trace point events/%s\n",
- name);
- return -1;
- }
+ ret = event_define_fields(call);
+ if (ret < 0) {
+ pr_warn("Could not initialize trace point events/%s\n", name);
+ return ret;
}
/*
@@ -2049,18 +2424,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
* triggers or filters.
*/
if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- trace_create_file("filter", 0644, file->dir, file,
- &ftrace_event_filter_fops);
+ trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
+ file, &ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ file, &event_trigger_fops);
}
#ifdef CONFIG_HIST_TRIGGERS
- trace_create_file("hist", 0444, file->dir, file,
+ trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
&event_hist_fops);
#endif
- trace_create_file("format", 0444, file->dir, call,
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
+ &event_hist_debug_fops);
+#endif
+ trace_create_file("format", TRACE_MODE_READ, file->dir, call,
&ftrace_event_format_fops);
#ifdef CONFIG_TRACE_EVENT_INJECT
@@ -2148,7 +2527,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
return ret;
list_add(&call->list, &ftrace_events);
- call->mod = mod;
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
+ atomic_set(&call->refcnt, 0);
+ else
+ call->module = mod;
return 0;
}
@@ -2269,9 +2651,9 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
/*
- * Since calls are grouped by systems, the likelyhood that the
+ * Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
- * previous call is high. As an optimization, we skip seaching
+ * previous call is high. As an optimization, we skip searching
* for a map[] that matches the call's system if the last call
* was from the same system. That's what last_i is for. If the
* call has the same system as the previous call, then last_i
@@ -2296,12 +2678,24 @@ static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
+ struct trace_pid_list *no_pid_list;
+ struct trace_pid_list *pid_list;
struct trace_event_file *file;
+ unsigned int first;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
return NULL;
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+
+ if (!trace_pid_list_first(pid_list, &first) ||
+ !trace_pid_list_first(no_pid_list, &first))
+ file->flags |= EVENT_FILE_FL_PID_FILTER;
+
file->event_call = call;
file->tr = tr;
atomic_set(&file->sm_ref, 0);
@@ -2322,15 +2716,18 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
if (!file)
return -ENOMEM;
- return event_create_dir(tr->event_dir, file);
+ if (eventdir_initialized)
+ return event_create_dir(tr->event_dir, file);
+ else
+ return event_define_fields(call);
}
/*
- * Just create a decriptor for early init. A descriptor is required
+ * Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
* the filesystem is initialized.
*/
-static __init int
+static int
__trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
@@ -2340,7 +2737,7 @@ __trace_early_add_new_event(struct trace_event_call *call,
if (!file)
return -ENOMEM;
- return 0;
+ return event_define_fields(call);
}
struct ftrace_module_file_ops;
@@ -2459,7 +2856,9 @@ static void trace_module_remove_events(struct module *mod)
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod)
+ if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
+ continue;
+ if (call->module == mod)
__trace_remove_event_call(call);
}
up_write(&trace_event_sem);
@@ -2493,7 +2892,7 @@ static int trace_module_notify(struct notifier_block *self,
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
- return 0;
+ return NOTIFY_OK;
}
static struct notifier_block trace_module_nb = {
@@ -2602,7 +3001,7 @@ struct trace_event_file *trace_get_event_file(const char *instance,
}
/* Don't let event modules unload while in use */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
ret = -EBUSY;
@@ -2632,7 +3031,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file);
void trace_put_event_file(struct trace_event_file *file)
{
mutex_lock(&event_mutex);
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
mutex_unlock(&event_mutex);
trace_array_put(file->tr);
@@ -2767,7 +3166,7 @@ static int free_probe_data(void *data)
if (!edata->ref) {
/* Remove the SOFT_MODE flag */
__ftrace_event_enable_disable(edata->file, 0, 1);
- module_put(edata->file->event_call->mod);
+ trace_event_put_ref(edata->file->event_call);
kfree(edata);
}
return 0;
@@ -2900,7 +3299,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -2930,7 +3329,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
out_free:
kfree(data);
goto out;
@@ -2963,14 +3362,13 @@ static inline int register_event_cmds(void) { return 0; }
#endif /* CONFIG_DYNAMIC_FTRACE */
/*
- * The top level array has already had its trace_event_file
- * descriptors created in order to allow for early events to
- * be recorded. This function is called after the tracefs has been
- * initialized, and we now have to create the files associated
- * to the events.
+ * The top level array and trace arrays created by boot-time tracing
+ * have already had its trace_event_file descriptors created in order
+ * to allow for early events to be recorded.
+ * This function is called after the tracefs has been initialized,
+ * and we now have to create the files associated to the events.
*/
-static __init void
-__trace_early_add_event_dirs(struct trace_array *tr)
+static void __trace_early_add_event_dirs(struct trace_array *tr)
{
struct trace_event_file *file;
int ret;
@@ -2985,20 +3383,20 @@ __trace_early_add_event_dirs(struct trace_array *tr)
}
/*
- * For early boot up, the top trace array requires to have
- * a list of events that can be enabled. This must be done before
- * the filesystem is set up in order to allow events to be traced
- * early.
+ * For early boot up, the top trace array and the trace arrays created
+ * by boot-time tracing require to have a list of events that can be
+ * enabled. This must be done before the filesystem is set up in order
+ * to allow events to be traced early.
*/
-static __init void
-__trace_early_add_events(struct trace_array *tr)
+void __trace_early_add_events(struct trace_array *tr)
{
struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
/* Early boot up should not have any modules loaded */
- if (WARN_ON_ONCE(call->mod))
+ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
+ WARN_ON_ONCE(call->module))
continue;
ret = __trace_early_add_new_event(call, tr);
@@ -3035,7 +3433,7 @@ static __init int setup_trace_event(char *str)
{
strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
ring_buffer_expanded = true;
- tracing_selftest_disabled = true;
+ disable_tracing_selftest("running event tracing");
return 1;
}
@@ -3048,7 +3446,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = tracefs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
pr_warn("Could not create tracefs 'set_event' entry\n");
@@ -3061,7 +3459,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
- entry = trace_create_file("enable", 0644, d_events,
+ entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
tr, &ftrace_tr_enable_fops);
if (!entry) {
pr_warn("Could not create tracefs 'enable' entry\n");
@@ -3070,19 +3468,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", 0644, parent,
+ entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_pid_fops);
if (!entry)
pr_warn("Could not create tracefs 'set_event_pid' entry\n");
+ entry = tracefs_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
+
/* ring buffer internal formats */
- entry = trace_create_file("header_page", 0444, d_events,
+ entry = trace_create_file("header_page", TRACE_MODE_READ, d_events,
ring_buffer_print_page_header,
&ftrace_show_header_fops);
if (!entry)
pr_warn("Could not create tracefs 'header_page' entry\n");
- entry = trace_create_file("header_event", 0444, d_events,
+ entry = trace_create_file("header_event", TRACE_MODE_READ, d_events,
ring_buffer_print_entry_header,
&ftrace_show_header_fops);
if (!entry)
@@ -3100,7 +3504,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
*
* When a new instance is created, it needs to set up its events
* directory, as well as other files associated with events. It also
- * creates the event hierachry in the @parent/events directory.
+ * creates the event hierarchy in the @parent/events directory.
*
* Returns 0 on success.
*
@@ -3117,7 +3521,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
goto out;
down_write(&trace_event_sem);
- __trace_add_event_dirs(tr);
+ /* If tr already has the event list, it is initialized in early boot. */
+ if (unlikely(!list_empty(&tr->events)))
+ __trace_early_add_event_dirs(tr);
+ else
+ __trace_add_event_dirs(tr);
up_write(&trace_event_sem);
out:
@@ -3158,7 +3566,7 @@ int event_trace_del_tracer(struct trace_array *tr)
clear_event_triggers(tr);
/* Clear the pid list */
- __ftrace_clear_event_pids(tr);
+ __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
@@ -3253,10 +3661,10 @@ static __init int event_trace_enable(void)
* initialize events and perhaps start any events that are on the
* command line. Unfortunately, there are some events that will not
* start this early, like the system call tracepoints that need
- * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
- * is called before pid 1 starts, and this flag is never set, making
- * the syscall tracepoint never get reached, but the event is enabled
- * regardless (and not doing anything).
+ * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But
+ * event_trace_enable() is called before pid 1 starts, and this flag
+ * is never set, making the syscall tracepoint never get reached, but
+ * the event is enabled regardless (and not doing anything).
*/
static __init int event_trace_enable_again(void)
{
@@ -3273,10 +3681,21 @@ static __init int event_trace_enable_again(void)
early_initcall(event_trace_enable_again);
+/* Init fields which doesn't related to the tracefs */
+static __init int event_trace_init_fields(void)
+{
+ if (trace_define_generic_fields())
+ pr_warn("tracing: Failed to allocated generic fields");
+
+ if (trace_define_common_fields())
+ pr_warn("tracing: Failed to allocate common fields");
+
+ return 0;
+}
+
__init int event_trace_init(void)
{
struct trace_array *tr;
- struct dentry *d_tracer;
struct dentry *entry;
int ret;
@@ -3284,22 +3703,12 @@ __init int event_trace_init(void)
if (!tr)
return -ENODEV;
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
-
- entry = tracefs_create_file("available_events", 0444, d_tracer,
- tr, &ftrace_avail_fops);
+ entry = tracefs_create_file("available_events", TRACE_MODE_READ,
+ NULL, tr, &ftrace_avail_fops);
if (!entry)
pr_warn("Could not create tracefs 'available_events' entry\n");
- if (trace_define_generic_fields())
- pr_warn("tracing: Failed to allocated generic fields");
-
- if (trace_define_common_fields())
- pr_warn("tracing: Failed to allocate common fields");
-
- ret = early_event_add_tracer(d_tracer, tr);
+ ret = early_event_add_tracer(NULL, tr);
if (ret)
return ret;
@@ -3308,6 +3717,9 @@ __init int event_trace_init(void)
if (ret)
pr_warn("Failed to register trace events module notifier\n");
#endif
+
+ eventdir_initialized = true;
+
return 0;
}
@@ -3316,6 +3728,7 @@ void __init trace_event_init(void)
event_trace_memsetup();
init_ftrace_syscalls();
event_trace_enable();
+ event_trace_init_fields();
}
#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST
@@ -3493,17 +3906,16 @@ static struct trace_event_file event_trace_file __initdata;
static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
+ struct ftrace_ops *op, struct ftrace_regs *regs)
{
struct trace_buffer *buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- unsigned long flags;
+ unsigned int trace_ctx;
long disabled;
int cpu;
- int pc;
- pc = preempt_count();
+ trace_ctx = tracing_gen_ctx();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3511,11 +3923,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
if (disabled != 1)
goto out;
- local_save_flags(flags);
-
event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
TRACE_FN, sizeof(*entry),
- flags, pc);
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3523,7 +3933,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->parent_ip = parent_ip;
event_trigger_unlock_commit(&event_trace_file, buffer, event,
- entry, flags, pc);
+ entry, trace_ctx);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
preempt_enable_notrace();
@@ -3532,7 +3942,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops trace_ops __initdata =
{
.func = function_test_events_call,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index bf44f6bbd0c3..c9124038b140 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -256,7 +256,7 @@ enum {
* is "&&" we don't call update_preds(). Instead continue to "c". As the
* next token after "c" is not "&&" but the end of input, we first process the
* "&&" by calling update_preds() for the "&&" then we process the "||" by
- * callin updates_preds() with the values for processing "||".
+ * calling updates_preds() with the values for processing "||".
*
* What does that mean? What update_preds() does is to first save the "target"
* of the program entry indexed by the current program entry's "target"
@@ -296,7 +296,7 @@ enum {
* and "FALSE" the program entry after that, we are now done with the first
* pass.
*
- * Making the above "a || b && c" have a progam of:
+ * Making the above "a || b && c" have a program of:
* prog[0] = { "a", 1, 2 }
* prog[1] = { "b", 0, 2 }
* prog[2] = { "c", 0, 3 }
@@ -390,7 +390,7 @@ enum {
* F: return FALSE
*
* As "r = a; if (!r) goto n5;" is obviously the same as
- * "if (!a) goto n5;" without doing anything we can interperate the
+ * "if (!a) goto n5;" without doing anything we can interpret the
* program as:
* n1: if (!a) goto n5;
* n2: if (!b) goto n5;
@@ -499,7 +499,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
ptr++;
break;
}
- /* fall through */
+ fallthrough;
default:
parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
next - str);
@@ -1273,7 +1273,7 @@ static int parse_pred(const char *str, void *data,
switch (op) {
case OP_NE:
pred->not = 1;
- /* Fall through */
+ fallthrough;
case OP_GLOB:
case OP_EQ:
break;
@@ -1561,27 +1561,6 @@ static inline void event_clear_filter(struct trace_event_file *file)
RCU_INIT_POINTER(file->filter, NULL);
}
-static inline void
-event_set_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline void
-event_clear_no_set_filter_flag(struct trace_event_file *file)
-{
- file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
-}
-
-static inline bool
-event_no_set_filter_flag(struct trace_event_file *file)
-{
- if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
- return true;
-
- return false;
-}
-
struct filter_list {
struct list_head list;
struct event_filter *filter;
@@ -1714,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
/**
* create_filter - create a filter for a trace_event_call
+ * @tr: the trace array associated with these events
* @call: trace_event_call to create a filter for
* @filter_str: filter string
* @set_str: remember @filter_str and enable detailed error in filter
@@ -1762,8 +1742,8 @@ int create_event_filter(struct trace_array *tr,
}
/**
- * create_system_filter - create a filter for an event_subsystem
- * @system: event_subsystem to create a filter for
+ * create_system_filter - create a filter for an event subsystem
+ * @dir: the descriptor for the subsystem directory
* @filter_str: filter string
* @filterp: out param for created filter (always updated on return)
*
@@ -1771,7 +1751,6 @@ int create_event_filter(struct trace_array *tr,
* and always remembers @filter_str.
*/
static int create_system_filter(struct trace_subsystem_dir *dir,
- struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
struct filter_parse_error *pe = NULL;
@@ -1779,13 +1758,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
err = create_filter_start(filter_str, true, &pe, filterp);
if (!err) {
- err = process_system_preds(dir, tr, pe, filter_str);
+ err = process_system_preds(dir, dir->tr, pe, filter_str);
if (!err) {
/* System filters just show a default message */
kfree((*filterp)->filter_string);
(*filterp)->filter_string = NULL;
} else {
- append_filter_err(tr, pe, *filterp);
+ append_filter_err(dir->tr, pe, *filterp);
}
}
create_filter_finish(pe);
@@ -1873,7 +1852,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
goto out_unlock;
}
- err = create_system_filter(dir, tr, filter_string, &filter);
+ err = create_system_filter(dir, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
@@ -1950,7 +1929,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
/*
* The 'ip' field could have multiple filters set, separated
* either by space or comma. We first cut the filter and apply
- * all pieces separatelly.
+ * all pieces separately.
*/
re = ftrace_function_filter_re(buf, len, &re_cnt);
if (!re)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5f6834a2bf41..319f9c8ca7e7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,13 +19,7 @@
#include <trace/events/mmflags.h>
#include "tracing_map.h"
-#include "trace.h"
-#include "trace_dynevent.h"
-
-#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
-
-#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+#include "trace_synth.h"
#define ERRORS \
C(NONE, "No error"), \
@@ -71,7 +65,11 @@
C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
- C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
+ C(EXPECT_NUMBER, "Expecting numeric literal"), \
+ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
+ C(DIVISION_BY_ZERO, "Division by zero"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -87,18 +85,23 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event);
#define HIST_FIELD_OPERANDS_MAX 2
#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
#define HIST_ACTIONS_MAX 8
+#define HIST_CONST_DIGITS_MAX 21
+#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */
enum field_op_id {
FIELD_OP_NONE,
FIELD_OP_PLUS,
FIELD_OP_MINUS,
FIELD_OP_UNARY_MINUS,
+ FIELD_OP_DIV,
+ FIELD_OP_MULT,
};
/*
@@ -125,6 +128,7 @@ struct hist_field {
unsigned int size;
unsigned int offset;
unsigned int is_signed;
+ unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
@@ -153,18 +157,36 @@ struct hist_field {
*/
unsigned int var_ref_idx;
bool read_once;
+
+ unsigned int var_str_idx;
+
+ /* Numeric literals are represented as u64 */
+ u64 constant;
+ /* Used to optimize division by constants */
+ u64 div_multiplier;
};
static u64 hist_field_none(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
return 0;
}
+static u64 hist_field_const(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ return field->constant;
+}
+
static u64 hist_field_counter(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -173,6 +195,7 @@ static u64 hist_field_counter(struct hist_field *field,
static u64 hist_field_string(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -183,6 +206,7 @@ static u64 hist_field_string(struct hist_field *hist_field,
static u64 hist_field_dynstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -195,6 +219,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
static u64 hist_field_pstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -205,52 +230,177 @@ static u64 hist_field_pstring(struct hist_field *hist_field,
static u64 hist_field_log2(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, elt, rbe, event);
+ u64 val = operand->fn(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_bucket(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+ unsigned long buckets = hist_field->buckets;
+
+ u64 val = operand->fn(operand, elt, buffer, rbe, event);
+
+ if (WARN_ON_ONCE(!buckets))
+ return val;
+
+ if (val >= LONG_MAX)
+ val = div64_ul(val, buckets);
+ else
+ val = (u64)((unsigned long)val / buckets);
+ return val * buckets;
+}
+
static u64 hist_field_plus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
static u64 hist_field_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
+static u64 hist_field_div(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+
+ /* Return -1 for the undefined case */
+ if (!val2)
+ return -1;
+
+ /* Use shift if the divisor is a power of 2 */
+ if (!(val2 & (val2 - 1)))
+ return val1 >> __ffs64(val2);
+
+ return div64_u64(val1, val2);
+}
+
+static u64 div_by_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return val1 >> __ffs64(operand2->constant);
+}
+
+static u64 div_by_not_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 div_by_mult_and_shift(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ /*
+ * If the divisor is a constant, do a multiplication and shift instead.
+ *
+ * Choose Z = some power of 2. If Y <= Z, then:
+ * X / Y = (X * (Z / Y)) / Z
+ *
+ * (Z / Y) is a constant (mult) which is calculated at parse time, so:
+ * X / Y = (X * mult) / Z
+ *
+ * The division by Z can be replaced by a shift since Z is a power of 2:
+ * X / Y = (X * mult) >> HIST_DIV_SHIFT
+ *
+ * As long, as X < Z the results will not be off by more than 1.
+ */
+ if (val1 < (1 << HIST_DIV_SHIFT)) {
+ u64 mult = operand2->div_multiplier;
+
+ return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT;
+ }
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 hist_field_mult(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+
+ return val1 * val2;
+}
+
static u64 hist_field_unary_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+ s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -259,6 +409,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
#define DEFINE_HIST_FIELD_FN(type) \
static u64 hist_field_##type(struct hist_field *hist_field, \
struct tracing_map_elt *elt, \
+ struct trace_buffer *buffer, \
struct ring_buffer_event *rbe, \
void *event) \
{ \
@@ -310,6 +461,8 @@ enum hist_field_flags {
HIST_FIELD_FL_VAR_REF = 1 << 14,
HIST_FIELD_FL_CPU = 1 << 15,
HIST_FIELD_FL_ALIAS = 1 << 16,
+ HIST_FIELD_FL_BUCKET = 1 << 17,
+ HIST_FIELD_FL_CONST = 1 << 18,
};
struct var_defs {
@@ -355,6 +508,7 @@ struct hist_trigger_data {
unsigned int n_keys;
unsigned int n_fields;
unsigned int n_vars;
+ unsigned int n_var_str;
unsigned int key_size;
struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
unsigned int n_sort_keys;
@@ -380,73 +534,11 @@ struct hist_trigger_data {
unsigned int n_save_var_str;
};
-static int create_synth_event(int argc, const char **argv);
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
-static int synth_event_release(struct dyn_event *ev);
-static bool synth_event_is_busy(struct dyn_event *ev);
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev);
-
-static struct dyn_event_operations synth_event_ops = {
- .create = create_synth_event,
- .show = synth_event_show,
- .is_busy = synth_event_is_busy,
- .free = synth_event_release,
- .match = synth_event_match,
-};
-
-struct synth_field {
- char *type;
- char *name;
- size_t size;
- unsigned int offset;
- bool is_signed;
- bool is_string;
-};
-
-struct synth_event {
- struct dyn_event devent;
- int ref;
- char *name;
- struct synth_field **fields;
- unsigned int n_fields;
- unsigned int n_u64;
- struct trace_event_class class;
- struct trace_event_call call;
- struct tracepoint *tp;
- struct module *mod;
-};
-
-static bool is_synth_event(struct dyn_event *ev)
-{
- return ev->ops == &synth_event_ops;
-}
-
-static struct synth_event *to_synth_event(struct dyn_event *ev)
-{
- return container_of(ev, struct synth_event, devent);
-}
-
-static bool synth_event_is_busy(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- return event->ref != 0;
-}
-
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev)
-{
- struct synth_event *sev = to_synth_event(ev);
-
- return strcmp(sev->name, event) == 0 &&
- (!system || strcmp(system, SYNTH_SYSTEM) == 0);
-}
-
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals);
@@ -538,7 +630,8 @@ struct track_data {
struct hist_elt_data {
char *comm;
u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
+ char **field_var_str;
+ int n_field_var_str;
};
struct snapshot_context {
@@ -546,6 +639,25 @@ struct snapshot_context {
void *key;
};
+/*
+ * Returns the specific division function to use if the divisor
+ * is constant. This avoids extra branches when the trigger is hit.
+ */
+static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+{
+ u64 div = divisor->constant;
+
+ if (!(div & (div - 1)))
+ return div_by_power_of_two;
+
+ /* If the divisor is too large, do a regular division */
+ if (div > (1 << HIST_DIV_SHIFT))
+ return div_by_not_power_of_two;
+
+ divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
+ return div_by_mult_and_shift;
+}
+
static void track_data_free(struct track_data *track_data)
{
struct hist_elt_data *elt_data;
@@ -589,6 +701,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
track_data_free(data);
return ERR_PTR(-ENOMEM);
}
+
data->elt.private_data = elt_data;
elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
@@ -621,7 +734,6 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (file) {
call = file->event_call;
-
system = call->class->system;
if (system) {
name = trace_event_name(call);
@@ -646,510 +758,6 @@ static void hist_err_clear(void)
last_cmd_loc[0] = '\0';
}
-struct synth_trace_event {
- struct trace_entry ent;
- u64 fields[];
-};
-
-static int synth_event_define_fields(struct trace_event_call *call)
-{
- struct synth_trace_event trace;
- int offset = offsetof(typeof(trace), fields);
- struct synth_event *event = call->data;
- unsigned int i, size, n_u64;
- char *name, *type;
- bool is_signed;
- int ret = 0;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- size = event->fields[i]->size;
- is_signed = event->fields[i]->is_signed;
- type = event->fields[i]->type;
- name = event->fields[i]->name;
- ret = trace_define_field(call, type, name, offset, size,
- is_signed, FILTER_OTHER);
- if (ret)
- break;
-
- event->fields[i]->offset = n_u64;
-
- if (event->fields[i]->is_string) {
- offset += STR_VAR_LEN_MAX;
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- offset += sizeof(u64);
- n_u64++;
- }
- }
-
- event->n_u64 = n_u64;
-
- return ret;
-}
-
-static bool synth_field_signed(char *type)
-{
- if (str_has_prefix(type, "u"))
- return false;
- if (strcmp(type, "gfp_t") == 0)
- return false;
-
- return true;
-}
-
-static int synth_field_is_string(char *type)
-{
- if (strstr(type, "char[") != NULL)
- return true;
-
- return false;
-}
-
-static int synth_field_string_size(char *type)
-{
- char buf[4], *end, *start;
- unsigned int len;
- int size, err;
-
- start = strstr(type, "char[");
- if (start == NULL)
- return -EINVAL;
- start += sizeof("char[") - 1;
-
- end = strchr(type, ']');
- if (!end || end < start)
- return -EINVAL;
-
- len = end - start;
- if (len > 3)
- return -EINVAL;
-
- strncpy(buf, start, len);
- buf[len] = '\0';
-
- err = kstrtouint(buf, 0, &size);
- if (err)
- return err;
-
- if (size > STR_VAR_LEN_MAX)
- return -EINVAL;
-
- return size;
-}
-
-static int synth_field_size(char *type)
-{
- int size = 0;
-
- if (strcmp(type, "s64") == 0)
- size = sizeof(s64);
- else if (strcmp(type, "u64") == 0)
- size = sizeof(u64);
- else if (strcmp(type, "s32") == 0)
- size = sizeof(s32);
- else if (strcmp(type, "u32") == 0)
- size = sizeof(u32);
- else if (strcmp(type, "s16") == 0)
- size = sizeof(s16);
- else if (strcmp(type, "u16") == 0)
- size = sizeof(u16);
- else if (strcmp(type, "s8") == 0)
- size = sizeof(s8);
- else if (strcmp(type, "u8") == 0)
- size = sizeof(u8);
- else if (strcmp(type, "char") == 0)
- size = sizeof(char);
- else if (strcmp(type, "unsigned char") == 0)
- size = sizeof(unsigned char);
- else if (strcmp(type, "int") == 0)
- size = sizeof(int);
- else if (strcmp(type, "unsigned int") == 0)
- size = sizeof(unsigned int);
- else if (strcmp(type, "long") == 0)
- size = sizeof(long);
- else if (strcmp(type, "unsigned long") == 0)
- size = sizeof(unsigned long);
- else if (strcmp(type, "pid_t") == 0)
- size = sizeof(pid_t);
- else if (strcmp(type, "gfp_t") == 0)
- size = sizeof(gfp_t);
- else if (synth_field_is_string(type))
- size = synth_field_string_size(type);
-
- return size;
-}
-
-static const char *synth_field_fmt(char *type)
-{
- const char *fmt = "%llu";
-
- if (strcmp(type, "s64") == 0)
- fmt = "%lld";
- else if (strcmp(type, "u64") == 0)
- fmt = "%llu";
- else if (strcmp(type, "s32") == 0)
- fmt = "%d";
- else if (strcmp(type, "u32") == 0)
- fmt = "%u";
- else if (strcmp(type, "s16") == 0)
- fmt = "%d";
- else if (strcmp(type, "u16") == 0)
- fmt = "%u";
- else if (strcmp(type, "s8") == 0)
- fmt = "%d";
- else if (strcmp(type, "u8") == 0)
- fmt = "%u";
- else if (strcmp(type, "char") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned char") == 0)
- fmt = "%u";
- else if (strcmp(type, "int") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned int") == 0)
- fmt = "%u";
- else if (strcmp(type, "long") == 0)
- fmt = "%ld";
- else if (strcmp(type, "unsigned long") == 0)
- fmt = "%lu";
- else if (strcmp(type, "pid_t") == 0)
- fmt = "%d";
- else if (strcmp(type, "gfp_t") == 0)
- fmt = "%x";
- else if (synth_field_is_string(type))
- fmt = "%s";
-
- return fmt;
-}
-
-static void print_synth_event_num_val(struct trace_seq *s,
- char *print_fmt, char *name,
- int size, u64 val, char *space)
-{
- switch (size) {
- case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
- break;
-
- case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
- break;
-
- case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
- break;
-
- default:
- trace_seq_printf(s, print_fmt, name, val, space);
- break;
- }
-}
-
-static enum print_line_t print_synth_event(struct trace_iterator *iter,
- int flags,
- struct trace_event *event)
-{
- struct trace_array *tr = iter->tr;
- struct trace_seq *s = &iter->seq;
- struct synth_trace_event *entry;
- struct synth_event *se;
- unsigned int i, n_u64;
- char print_fmt[32];
- const char *fmt;
-
- entry = (struct synth_trace_event *)iter->ent;
- se = container_of(event, struct synth_event, call.event);
-
- trace_seq_printf(s, "%s: ", se->name);
-
- for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
- if (trace_seq_has_overflowed(s))
- goto end;
-
- fmt = synth_field_fmt(se->fields[i]->type);
-
- /* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
- trace_seq_printf(s, "%s ", fmt);
-
- snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
-
- /* parameter values */
- if (se->fields[i]->is_string) {
- trace_seq_printf(s, print_fmt, se->fields[i]->name,
- (char *)&entry->fields[n_u64],
- i == se->n_fields - 1 ? "" : " ");
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
- char *space = (i == se->n_fields - 1 ? "" : " ");
-
- print_synth_event_num_val(s, print_fmt,
- se->fields[i]->name,
- se->fields[i]->size,
- entry->fields[n_u64],
- space);
-
- if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
- trace_seq_puts(s, " (");
- trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
- __flags);
- trace_seq_putc(s, ')');
- }
- n_u64++;
- }
- }
-end:
- trace_seq_putc(s, '\n');
-
- return trace_handle_return(s);
-}
-
-static struct trace_event_functions synth_event_funcs = {
- .trace = print_synth_event
-};
-
-static notrace void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
-{
- struct trace_event_file *trace_file = __data;
- struct synth_trace_event *entry;
- struct trace_event_buffer fbuffer;
- struct trace_buffer *buffer;
- struct synth_event *event;
- unsigned int i, n_u64, val_idx;
- int fields_size = 0;
-
- event = trace_file->event_call->data;
-
- if (trace_trigger_soft_disabled(trace_file))
- return;
-
- fields_size = event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- buffer = trace_file->tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
-
- entry = trace_event_buffer_reserve(&fbuffer, trace_file,
- sizeof(*entry) + fields_size);
- if (!entry)
- goto out;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- val_idx = var_ref_idx[i];
- if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[val_idx];
- char *str_field = (char *)&entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = event->fields[i];
- u64 val = var_ref_vals[val_idx];
-
- switch (field->size) {
- case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-
- trace_event_buffer_commit(&fbuffer);
-out:
- ring_buffer_nest_end(buffer);
-}
-
-static void free_synth_event_print_fmt(struct trace_event_call *call)
-{
- if (call) {
- kfree(call->print_fmt);
- call->print_fmt = NULL;
- }
-}
-
-static int __set_synth_event_print_fmt(struct synth_event *event,
- char *buf, int len)
-{
- const char *fmt;
- int pos = 0;
- int i;
-
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
-
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- for (i = 0; i < event->n_fields; i++) {
- fmt = synth_field_fmt(event->fields[i]->type);
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
- event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
- }
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
-
- for (i = 0; i < event->n_fields; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", REC->%s", event->fields[i]->name);
- }
-
-#undef LEN_OR_ZERO
-
- /* return the length of print_fmt */
- return pos;
-}
-
-static int set_synth_event_print_fmt(struct trace_event_call *call)
-{
- struct synth_event *event = call->data;
- char *print_fmt;
- int len;
-
- /* First: called with 0 length to calculate the needed length */
- len = __set_synth_event_print_fmt(event, NULL, 0);
-
- print_fmt = kmalloc(len + 1, GFP_KERNEL);
- if (!print_fmt)
- return -ENOMEM;
-
- /* Second: actually write the @print_fmt */
- __set_synth_event_print_fmt(event, print_fmt, len + 1);
- call->print_fmt = print_fmt;
-
- return 0;
-}
-
-static void free_synth_field(struct synth_field *field)
-{
- kfree(field->type);
- kfree(field->name);
- kfree(field);
-}
-
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
-{
- struct synth_field *field;
- const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
- int len, ret = 0;
-
- if (field_type[0] == ';')
- field_type++;
-
- if (!strcmp(field_type, "unsigned")) {
- if (argc < 3)
- return ERR_PTR(-EINVAL);
- prefix = "unsigned ";
- field_type = argv[1];
- field_name = argv[2];
- *consumed = 3;
- } else {
- field_name = argv[1];
- *consumed = 2;
- }
-
- field = kzalloc(sizeof(*field), GFP_KERNEL);
- if (!field)
- return ERR_PTR(-ENOMEM);
-
- len = strlen(field_name);
- array = strchr(field_name, '[');
- if (array)
- len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
-
- field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
- goto free;
- }
-
- if (field_type[0] == ';')
- field_type++;
- len = strlen(field_type) + 1;
- if (array)
- len += strlen(array);
- if (prefix)
- len += strlen(prefix);
-
- field->type = kzalloc(len, GFP_KERNEL);
- if (!field->type) {
- ret = -ENOMEM;
- goto free;
- }
- if (prefix)
- strcat(field->type, prefix);
- strcat(field->type, field_type);
- if (array) {
- strcat(field->type, array);
- if (field->type[len - 1] == ';')
- field->type[len - 1] = '\0';
- }
-
- field->size = synth_field_size(field->type);
- if (!field->size) {
- ret = -EINVAL;
- goto free;
- }
-
- if (synth_field_is_string(field->type))
- field->is_string = true;
-
- field->is_signed = synth_field_signed(field->type);
-
- out:
- return field;
- free:
- free_synth_field(field);
- field = ERR_PTR(ret);
- goto out;
-}
-
-static void free_synth_tracepoint(struct tracepoint *tp)
-{
- if (!tp)
- return;
-
- kfree(tp->name);
- kfree(tp);
-}
-
-static struct tracepoint *alloc_synth_tracepoint(char *name)
-{
- struct tracepoint *tp;
-
- tp = kzalloc(sizeof(*tp), GFP_KERNEL);
- if (!tp)
- return ERR_PTR(-ENOMEM);
-
- tp->name = kstrdup(name, GFP_KERNEL);
- if (!tp->name) {
- kfree(tp);
- return ERR_PTR(-ENOMEM);
- }
-
- return tp;
-}
-
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
unsigned int *var_ref_idx);
@@ -1177,147 +785,9 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
}
}
-static struct synth_event *find_synth_event(const char *name)
-{
- struct dyn_event *pos;
- struct synth_event *event;
-
- for_each_dyn_event(pos) {
- if (!is_synth_event(pos))
- continue;
- event = to_synth_event(pos);
- if (strcmp(event->name, name) == 0)
- return event;
- }
-
- return NULL;
-}
-
-static struct trace_event_fields synth_event_fields_array[] = {
- { .type = TRACE_FUNCTION_TYPE,
- .define_fields = synth_event_define_fields },
- {}
-};
-
-static int register_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret = 0;
-
- event->call.class = &event->class;
- event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
- if (!event->class.system) {
- ret = -ENOMEM;
- goto out;
- }
-
- event->tp = alloc_synth_tracepoint(event->name);
- if (IS_ERR(event->tp)) {
- ret = PTR_ERR(event->tp);
- event->tp = NULL;
- goto out;
- }
-
- INIT_LIST_HEAD(&call->class->fields);
- call->event.funcs = &synth_event_funcs;
- call->class->fields_array = synth_event_fields_array;
-
- ret = register_trace_event(&call->event);
- if (!ret) {
- ret = -ENODEV;
- goto out;
- }
- call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
- call->class->probe = trace_event_raw_event_synth;
- call->data = event;
- call->tp = event->tp;
-
- ret = trace_add_event_call(call);
- if (ret) {
- pr_warn("Failed to register synthetic event: %s\n",
- trace_event_name(call));
- goto err;
- }
-
- ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
- trace_remove_event_call(call);
- goto err;
- }
- out:
- return ret;
- err:
- unregister_trace_event(&call->event);
- goto out;
-}
-
-static int unregister_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret;
-
- ret = trace_remove_event_call(call);
-
- return ret;
-}
-
-static void free_synth_event(struct synth_event *event)
-{
- unsigned int i;
-
- if (!event)
- return;
-
- for (i = 0; i < event->n_fields; i++)
- free_synth_field(event->fields[i]);
-
- kfree(event->fields);
- kfree(event->name);
- kfree(event->class.system);
- free_synth_tracepoint(event->tp);
- free_synth_event_print_fmt(&event->call);
- kfree(event);
-}
-
-static struct synth_event *alloc_synth_event(const char *name, int n_fields,
- struct synth_field **fields)
-{
- struct synth_event *event;
- unsigned int i;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event) {
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->name = kstrdup(name, GFP_KERNEL);
- if (!event->name) {
- kfree(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
- if (!event->fields) {
- free_synth_event(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- dyn_event_init(&event->devent, &synth_event_ops);
-
- for (i = 0; i < n_fields; i++)
- event->fields[i] = fields[i];
-
- event->n_fields = n_fields;
- out:
- return event;
-}
-
static void action_trace(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -1331,1065 +801,16 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
-static int synth_event_check_arg_fn(void *data)
-{
- struct dynevent_arg_pair *arg_pair = data;
- int size;
-
- size = synth_field_size((char *)arg_pair->lhs);
-
- return size ? 0 : -EINVAL;
-}
-
-/**
- * synth_event_add_field - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type: The type of the new field to add
- * @name: The name of the new field to add
- *
- * Add a new field to a synthetic event cmd object. Field ordering is in
- * the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
- const char *name)
-{
- struct dynevent_arg_pair arg_pair;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type || !name)
- return -EINVAL;
-
- dynevent_arg_pair_init(&arg_pair, 0, ';');
-
- arg_pair.lhs = type;
- arg_pair.rhs = name;
-
- ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field);
-
-/**
- * synth_event_add_field_str - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type_name: The type and name of the new field to add, as a single string
- *
- * Add a new field to a synthetic event cmd object, as a single
- * string. The @type_name string is expected to be of the form 'type
- * name', which will be appended by ';'. No sanity checking is done -
- * what's passed in is assumed to already be well-formed. Field
- * ordering is in the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
-{
- struct dynevent_arg arg;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type_name)
- return -EINVAL;
-
- dynevent_arg_init(&arg, ';');
-
- arg.str = type_name;
-
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field_str);
-
-/**
- * synth_event_add_fields - Add multiple fields to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Add a new set of fields to a synthetic event cmd object. The event
- * fields that will be defined for the event should be passed in as an
- * array of struct synth_field_desc, and the number of elements in the
- * array passed in as n_fields. Field ordering will retain the
- * ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_fields(struct dynevent_cmd *cmd,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- unsigned int i;
- int ret = 0;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_fields);
-
-/**
- * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
- *
- * NOTE: Users normally won't want to call this function directly, but
- * rather use the synth_event_gen_cmd_start() wrapper, which
- * automatically adds a NULL to the end of the arg list. If this
- * function is used directly, make sure the last arg in the variable
- * arg list is NULL.
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * There should be an even number variable args, each pair consisting
- * of a type followed by a field name.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod, ...)
-{
- struct dynevent_arg arg;
- va_list args;
- int ret;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- va_start(args, mod);
- for (;;) {
- const char *type, *name;
-
- type = va_arg(args, const char *);
- if (!type)
- break;
- name = va_arg(args, const char *);
- if (!name)
- break;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, type, name);
- if (ret)
- break;
- }
- va_end(args);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
-
-/**
- * synth_event_gen_cmd_array_start - Start synthetic event command from an array
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * The event fields that will be defined for the event should be
- * passed in as an array of struct synth_field_desc, and the number of
- * elements in the array passed in as n_fields. Field ordering will
- * retain the ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- struct dynevent_arg arg;
- unsigned int i;
- int ret = 0;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (n_fields > SYNTH_FIELDS_MAX)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL)
- return -EINVAL;
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
-{
- struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
- struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
-
- /*
- * Argument syntax:
- * - Add synthetic event: <event_name> field[;field] ...
- * - Remove synthetic event: !<event_name> field[;field] ...
- * where 'field' = type field_name
- */
-
- if (name[0] == '\0' || argc < 1)
- return -EINVAL;
-
- mutex_lock(&event_mutex);
-
- event = find_synth_event(name);
- if (event) {
- ret = -EEXIST;
- goto out;
- }
-
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- goto err;
- }
-
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
- goto err;
- }
- fields[n_fields++] = field;
- i += consumed - 1;
- }
-
- if (i < argc && strcmp(argv[i], ";") != 0) {
- ret = -EINVAL;
- goto err;
- }
-
- event = alloc_synth_event(name, n_fields, fields);
- if (IS_ERR(event)) {
- ret = PTR_ERR(event);
- event = NULL;
- goto err;
- }
- ret = register_synth_event(event);
- if (!ret)
- dyn_event_add(&event->devent);
- else
- free_synth_event(event);
- out:
- mutex_unlock(&event_mutex);
-
- return ret;
- err:
- for (i = 0; i < n_fields; i++)
- free_synth_field(fields[i]);
-
- goto out;
-}
-
-/**
- * synth_event_create - Create a new synthetic event
- * @name: The name of the new sythetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- * @mod: The module creating the event, NULL if not created from a module
- *
- * Create a new synthetic event with the given name under the
- * trace/events/synthetic/ directory. The event fields that will be
- * defined for the event should be passed in as an array of struct
- * synth_field_desc, and the number elements in the array passed in as
- * n_fields. Field ordering will retain the ordering given in the
- * fields array.
- *
- * If the new synthetic event is being created from a module, the mod
- * param must be non-NULL. This will ensure that the trace buffer
- * won't contain unreadable events.
- *
- * The new synth event should be deleted using synth_event_delete()
- * function. The new synthetic event can be generated from modules or
- * other kernel code using trace_synth_event() and related functions.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_create(const char *name, struct synth_field_desc *fields,
- unsigned int n_fields, struct module *mod)
-{
- struct dynevent_cmd cmd;
- char *buf;
- int ret;
-
- buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
-
- ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
- fields, n_fields);
- if (ret)
- goto out;
-
- ret = synth_event_gen_cmd_end(&cmd);
- out:
- kfree(buf);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_create);
-
-static int destroy_synth_event(struct synth_event *se)
-{
- int ret;
-
- if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
- }
-
- return ret;
-}
-
-/**
- * synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
- *
- * Delete a synthetic event that was created with synth_event_create().
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_delete(const char *event_name)
-{
- struct synth_event *se = NULL;
- struct module *mod = NULL;
- int ret = -ENOENT;
-
- mutex_lock(&event_mutex);
- se = find_synth_event(event_name);
- if (se) {
- mod = se->mod;
- ret = destroy_synth_event(se);
- }
- mutex_unlock(&event_mutex);
-
- if (mod) {
- mutex_lock(&trace_types_lock);
- /*
- * It is safest to reset the ring buffer if the module
- * being unloaded registered any events that were
- * used. The only worry is if a new module gets
- * loaded, and takes on the same id as the events of
- * this module. When printing out the buffer, traced
- * events left over from this module may be passed to
- * the new module events and unexpected results may
- * occur.
- */
- tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_delete);
-
-static int create_or_delete_synth_event(int argc, char **argv)
-{
- const char *name = argv[0];
- int ret;
-
- /* trace_run_command() ensures argc != 0 */
- if (name[0] == '!') {
- ret = synth_event_delete(name + 1);
- return ret;
- }
-
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
-}
-
-static int synth_event_run_command(struct dynevent_cmd *cmd)
-{
- struct synth_event *se;
- int ret;
-
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
- if (ret)
- return ret;
-
- se = find_synth_event(cmd->event_name);
- if (WARN_ON(!se))
- return -ENOENT;
-
- se->mod = cmd->private_data;
-
- return ret;
-}
-
-/**
- * synth_event_cmd_init - Initialize a synthetic event command object
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @buf: A pointer to the buffer used to build the command
- * @maxlen: The length of the buffer passed in @buf
- *
- * Initialize a synthetic event command object. Use this before
- * calling any of the other dyenvent_cmd functions.
- */
-void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
-{
- dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
- synth_event_run_command);
-}
-EXPORT_SYMBOL_GPL(synth_event_cmd_init);
-
-static inline int
-__synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int entry_size, fields_size = 0;
- int ret = 0;
-
- memset(trace_state, '\0', sizeof(*trace_state));
-
- /*
- * Normal event tracing doesn't get called at all unless the
- * ENABLED bit is set (which attaches the probe thus allowing
- * this code to be called, etc). Because this is called
- * directly by the user, we don't have that but we still need
- * to honor not logging when disabled. For the the iterated
- * trace case, we save the enabed state upon start and just
- * ignore the following data calls.
- */
- if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
- trace_trigger_soft_disabled(file)) {
- trace_state->disabled = true;
- ret = -ENOENT;
- goto out;
- }
-
- trace_state->event = file->event_call->data;
-
- fields_size = trace_state->event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- trace_state->buffer = file->tr->array_buffer.buffer;
- ring_buffer_nest_start(trace_state->buffer);
-
- entry_size = sizeof(*trace_state->entry) + fields_size;
- trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
- file,
- entry_size);
- if (!trace_state->entry) {
- ring_buffer_nest_end(trace_state->buffer);
- ret = -EINVAL;
- }
-out:
- return ret;
-}
-
-static inline void
-__synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- trace_event_buffer_commit(&trace_state->fbuffer);
-
- ring_buffer_nest_end(trace_state->buffer);
-}
-
-/**
- * synth_event_trace - Trace a synthetic event
- * @file: The trace_event_file representing the synthetic event
- * @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
- *
- * Trace a synthetic event using the values passed in the variable
- * argument list.
- *
- * The argument list should be a list 'n_vals' u64 values. The number
- * of vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- va_list args;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- va_start(args, n_vals);
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- u64 val;
-
- val = va_arg(args, u64);
-
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
- va_end(args);
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace);
-
-/**
- * synth_event_trace_array - Trace a synthetic event from an array
- * @file: The trace_event_file representing the synthetic event
- * @vals: Array of values
- * @n_vals: The number of values in vals
- *
- * Trace a synthetic event using the values passed in as 'vals'.
- *
- * The 'vals' array is just an array of 'n_vals' u64. The number of
- * vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
- unsigned int n_vals)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)vals[i];
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
- u64 val = vals[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_array);
-
-/**
- * synth_event_trace_start - Start piecewise synthetic event trace
- * @file: The trace_event_file representing the synthetic event
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Start the trace of a synthetic event field-by-field rather than all
- * at once.
- *
- * This function 'opens' an event trace, which means space is reserved
- * for the event in the trace buffer, after which the event's
- * individual field values can be set through either
- * synth_event_add_next_val() or synth_event_add_val().
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state until the event trace is
- * closed (and the event finally traced) using
- * synth_event_trace_end().
- *
- * Note that synth_event_trace_end() must be called after all values
- * have been added for each event trace, regardless of whether adding
- * all field values succeeded or not.
- *
- * Note also that for a given event trace, all fields must be added
- * using either synth_event_add_next_val() or synth_event_add_val()
- * but not both together or interleaved.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int ret;
-
- if (!trace_state)
- return -EINVAL;
-
- ret = __synth_event_trace_start(file, trace_state);
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_start);
-
-static int __synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- struct synth_field *field = NULL;
- struct synth_trace_event *entry;
- struct synth_event *event;
- int i, ret = 0;
-
- if (!trace_state) {
- ret = -EINVAL;
- goto out;
- }
-
- /* can't mix add_next_synth_val() with add_synth_val() */
- if (field_name) {
- if (trace_state->add_next) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_name = true;
- } else {
- if (trace_state->add_name) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_next = true;
- }
-
- if (trace_state->disabled)
- goto out;
-
- event = trace_state->event;
- if (trace_state->add_name) {
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
- if (strcmp(field->name, field_name) == 0)
- break;
- }
- if (!field) {
- ret = -EINVAL;
- goto out;
- }
- } else {
- if (trace_state->cur_field >= event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
- field = event->fields[trace_state->cur_field++];
- }
-
- entry = trace_state->entry;
- if (field->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field;
-
- if (!str_val) {
- ret = -EINVAL;
- goto out;
- }
-
- str_field = (char *)&entry->fields[field->offset];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- } else {
- switch (field->size) {
- case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
- break;
-
- default:
- trace_state->entry->fields[field->offset] = val;
- break;
- }
- }
- out:
- return ret;
-}
-
-/**
- * synth_event_add_next_val - Add the next field's value to an open synth trace
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the next field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function assumes all the fields in an event are to be set one
- * after another - successive calls to this function are made, one for
- * each field, in the order of the fields in the event, until all
- * fields have been set. If you'd rather set each field individually
- * without regard to ordering, synth_event_add_val() can be used
- * instead.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_next_val(u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(NULL, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_next_val);
-
-/**
- * synth_event_add_val - Add a named field's value to an open synth trace
- * @field_name: The name of the synthetic event field value to set
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the named field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function looks up the field name, and if found, sets the field
- * to the specified value. This lookup makes this function more
- * expensive than synth_event_add_next_val(), so use that or the
- * none-piecewise synth_event_trace() instead if efficiency is more
- * important.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(field_name, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_val);
-
-/**
- * synth_event_trace_end - End piecewise synthetic event trace
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * End the trace of a synthetic event opened by
- * synth_event_trace__start().
- *
- * This function 'closes' an event trace, which basically means that
- * it commits the reserved event and cleans up other loose ends.
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state opened with
- * synth_event_trace_start().
- *
- * Note that this function must be called after all values have been
- * added for each event trace, regardless of whether adding all field
- * values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- if (!trace_state)
- return -EINVAL;
-
- __synth_event_trace_end(trace_state);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_end);
-
-static int create_synth_event(int argc, const char **argv)
-{
- const char *name = argv[0];
- int len;
-
- if (name[0] != 's' || name[1] != ':')
- return -ECANCELED;
- name += 2;
-
- /* This interface accepts group name prefix */
- if (strchr(name, '/')) {
- len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
- return -EINVAL;
- name += len;
- }
- return __create_synth_event(argc - 1, name, argv + 1);
-}
-
-static int synth_event_release(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
- int ret;
-
- if (event->ref)
- return -EBUSY;
-
- ret = unregister_synth_event(event);
- if (ret)
- return ret;
-
- dyn_event_remove(ev);
- free_synth_event(event);
- return 0;
-}
-
-static int __synth_event_show(struct seq_file *m, struct synth_event *event)
-{
- struct synth_field *field;
- unsigned int i;
-
- seq_printf(m, "%s\t", event->name);
-
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
-
- /* parameter values */
- seq_printf(m, "%s %s%s", field->type, field->name,
- i == event->n_fields - 1 ? "" : "; ");
- }
-
- seq_putc(m, '\n');
-
- return 0;
-}
-
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- seq_printf(m, "s:%s/", event->class.system);
-
- return __synth_event_show(m, event);
-}
-
-static int synth_events_seq_show(struct seq_file *m, void *v)
-{
- struct dyn_event *ev = v;
-
- if (!is_synth_event(ev))
- return 0;
-
- return __synth_event_show(m, to_synth_event(ev));
-}
-
-static const struct seq_operations synth_events_seq_op = {
- .start = dyn_event_seq_start,
- .next = dyn_event_seq_next,
- .stop = dyn_event_seq_stop,
- .show = synth_events_seq_show,
-};
-
-static int synth_events_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = dyn_events_release_all(&synth_event_ops);
- if (ret < 0)
- return ret;
- }
-
- return seq_open(file, &synth_events_seq_op);
-}
-
-static ssize_t synth_events_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- return trace_parse_run_command(file, buffer, count, ppos,
- create_or_delete_synth_event);
-}
-
-static const struct file_operations synth_events_fops = {
- .open = synth_events_open,
- .write = synth_events_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static u64 hist_field_timestamp(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_trigger_data *hist_data = hist_field->hist_data;
struct trace_array *tr = hist_data->event_file->tr;
- u64 ts = ring_buffer_event_time_stamp(rbe);
+ u64 ts = ring_buffer_event_time_stamp(buffer, rbe);
if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
ts = ns2usecs(ts);
@@ -2399,6 +820,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
static u64 hist_field_cpu(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -2779,6 +1201,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
static u64 hist_field_var_ref(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -2851,10 +1274,11 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
else if (field->flags & HIST_FIELD_FL_LOG2 ||
- field->flags & HIST_FIELD_FL_ALIAS)
+ field->flags & HIST_FIELD_FL_ALIAS ||
+ field->flags & HIST_FIELD_FL_BUCKET)
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
- field_name = "cpu";
+ field_name = "common_cpu";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -3119,9 +1543,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
unsigned int i;
- for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ for (i = 0; i < elt_data->n_field_var_str; i++)
kfree(elt_data->field_var_str[i]);
+ kfree(elt_data->field_var_str);
+
kfree(elt_data->comm);
kfree(elt_data);
}
@@ -3138,17 +1564,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_trigger_data *hist_data = elt->map->private_data;
unsigned int size = TASK_COMM_LEN;
struct hist_elt_data *elt_data;
- struct hist_field *key_field;
+ struct hist_field *hist_field;
unsigned int i, n_str;
elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
if (!elt_data)
return -ENOMEM;
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
elt_data->comm = kzalloc(size, GFP_KERNEL);
if (!elt_data->comm) {
kfree(elt_data);
@@ -3158,10 +1584,24 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_save_var_str;
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str +
+ hist_data->n_var_str;
+ if (n_str > SYNTH_FIELDS_MAX) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+
+ BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1));
size = STR_VAR_LEN_MAX;
+ elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
+ if (!elt_data->field_var_str) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+ elt_data->n_field_var_str = n_str;
+
for (i = 0; i < n_str; i++) {
elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
if (!elt_data->field_var_str[i]) {
@@ -3205,6 +1645,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "syscall";
else if (hist_field->flags & HIST_FIELD_FL_LOG2)
flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_BUCKET)
+ flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
@@ -3215,6 +1657,12 @@ static void expr_field_str(struct hist_field *field, char *expr)
{
if (field->flags & HIST_FIELD_FL_VAR_REF)
strcat(expr, "$");
+ else if (field->flags & HIST_FIELD_FL_CONST) {
+ char str[HIST_CONST_DIGITS_MAX];
+
+ snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
+ strcat(expr, str);
+ }
strcat(expr, hist_field_name(field, 0));
@@ -3270,6 +1718,12 @@ static char *expr_str(struct hist_field *field, unsigned int level)
case FIELD_OP_PLUS:
strcat(expr, "+");
break;
+ case FIELD_OP_DIV:
+ strcat(expr, "/");
+ break;
+ case FIELD_OP_MULT:
+ strcat(expr, "*");
+ break;
default:
kfree(expr);
return NULL;
@@ -3280,27 +1734,92 @@ static char *expr_str(struct hist_field *field, unsigned int level)
return expr;
}
-static int contains_operator(char *str)
+/*
+ * If field_op != FIELD_OP_NONE, *sep points to the root operator
+ * of the expression tree to be evaluated.
+ */
+static int contains_operator(char *str, char **sep)
{
enum field_op_id field_op = FIELD_OP_NONE;
- char *op;
+ char *minus_op, *plus_op, *div_op, *mult_op;
- op = strpbrk(str, "+-");
- if (!op)
- return FIELD_OP_NONE;
- switch (*op) {
- case '-':
- if (*str == '-')
+ /*
+ * Report the last occurrence of the operators first, so that the
+ * expression is evaluated left to right. This is important since
+ * subtraction and division are not associative.
+ *
+ * e.g
+ * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2
+ * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2
+ */
+
+ /*
+ * First, find lower precedence addition and subtraction
+ * since the expression will be evaluated recursively.
+ */
+ minus_op = strrchr(str, '-');
+ if (minus_op) {
+ /*
+ * Unary minus is not supported in sub-expressions. If
+ * present, it is always the next root operator.
+ */
+ if (minus_op == str) {
field_op = FIELD_OP_UNARY_MINUS;
- else
- field_op = FIELD_OP_MINUS;
- break;
- case '+':
- field_op = FIELD_OP_PLUS;
- break;
- default:
- break;
+ goto out;
+ }
+
+ field_op = FIELD_OP_MINUS;
+ }
+
+ plus_op = strrchr(str, '+');
+ if (plus_op || minus_op) {
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (plus_op > minus_op)
+ field_op = FIELD_OP_PLUS;
+ goto out;
+ }
+
+ /*
+ * Multiplication and division have higher precedence than addition and
+ * subtraction.
+ */
+ div_op = strrchr(str, '/');
+ if (div_op)
+ field_op = FIELD_OP_DIV;
+
+ mult_op = strrchr(str, '*');
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (mult_op > div_op)
+ field_op = FIELD_OP_MULT;
+
+out:
+ if (sep) {
+ switch (field_op) {
+ case FIELD_OP_UNARY_MINUS:
+ case FIELD_OP_MINUS:
+ *sep = minus_op;
+ break;
+ case FIELD_OP_PLUS:
+ *sep = plus_op;
+ break;
+ case FIELD_OP_DIV:
+ *sep = div_op;
+ break;
+ case FIELD_OP_MULT:
+ *sep = mult_op;
+ break;
+ case FIELD_OP_NONE:
+ default:
+ *sep = NULL;
+ break;
+ }
}
return field_op;
@@ -3318,7 +1837,12 @@ static void __destroy_hist_field(struct hist_field *hist_field)
kfree(hist_field->var.name);
kfree(hist_field->name);
- kfree(hist_field->type);
+
+ /* Can likely be a const */
+ kfree_const(hist_field->type);
+
+ kfree(hist_field->system);
+ kfree(hist_field->event_name);
kfree(hist_field);
}
@@ -3372,6 +1896,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
hist_field->size = sizeof(u64);
+ hist_field->type = "u64";
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_CONST) {
+ hist_field->fn = hist_field_const;
+ hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -3383,12 +1914,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
- if (flags & HIST_FIELD_FL_LOG2) {
- unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
- hist_field->fn = hist_field_log2;
+ if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
+ unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
+ hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
+ hist_field_bucket;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
- hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
goto out;
@@ -3397,42 +1929,41 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_TIMESTAMP) {
hist_field->fn = hist_field_timestamp;
hist_field->size = sizeof(u64);
- hist_field->type = kstrdup("u64", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
hist_field->fn = hist_field_cpu;
hist_field->size = sizeof(int);
- hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "unsigned int";
goto out;
}
if (WARN_ON_ONCE(!field))
goto out;
- if (is_string_field(field)) {
+ /* Pointers to strings are just pointers and dangerous to dereference */
+ if (is_string_field(field) &&
+ (field->filter_type != FILTER_PTR_STRING)) {
flags |= HIST_FIELD_FL_STRING;
hist_field->size = MAX_FILTER_STR_VAL;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
- if (field->filter_type == FILTER_STATIC_STRING)
+ if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
- else if (field->filter_type == FILTER_DYN_STRING)
+ hist_field->size = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
hist_field->fn = hist_field_dynstring;
else
hist_field->fn = hist_field_pstring;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -3518,7 +2049,7 @@ static int init_var_ref(struct hist_field *ref_field,
}
}
- ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL);
if (!ref_field->type) {
err = -ENOMEM;
goto free;
@@ -3615,7 +2146,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data,
if (strcmp(var_name, name) == 0) {
field = hist_data->attrs->var_defs.expr[i];
- if (contains_operator(field) || is_var_ref(field))
+ if (contains_operator(field, NULL) || is_var_ref(field))
continue;
return field;
}
@@ -3676,7 +2207,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
static struct ftrace_event_field *
parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
- char *field_str, unsigned long *flags)
+ char *field_str, unsigned long *flags, unsigned long *buckets)
{
struct ftrace_event_field *field = NULL;
char *field_name, *modifier, *str;
@@ -3692,7 +2223,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_HEX;
else if (strcmp(modifier, "sym") == 0)
*flags |= HIST_FIELD_FL_SYM;
- else if (strcmp(modifier, "sym-offset") == 0)
+ /*
+ * 'sym-offset' occurrences in the trigger string are modified
+ * to 'symXoffset' to simplify arithmetic expression parsing.
+ */
+ else if (strcmp(modifier, "symXoffset") == 0)
*flags |= HIST_FIELD_FL_SYM_OFFSET;
else if ((strcmp(modifier, "execname") == 0) &&
(strcmp(field_name, "common_pid") == 0))
@@ -3703,7 +2238,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
- else {
+ else if (strncmp(modifier, "bucket", 6) == 0) {
+ int ret;
+
+ modifier += 6;
+
+ if (*modifier == 's')
+ modifier++;
+ if (*modifier != '=')
+ goto error;
+ modifier++;
+ ret = kstrtoul(modifier, 0, buckets);
+ if (ret || !(*buckets))
+ goto error;
+ *flags |= HIST_FIELD_FL_BUCKET;
+ } else {
+ error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
field = ERR_PTR(-EINVAL);
goto out;
@@ -3715,14 +2265,24 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
- } else if (strcmp(field_name, "cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
- field = ERR_PTR(-EINVAL);
- goto out;
+ /*
+ * For backward compatibility, if field_name
+ * was "cpu", then we treat this the same as
+ * common_cpu.
+ */
+ if (strcmp(field_name, "cpu") == 0) {
+ *flags |= HIST_FIELD_FL_CPU;
+ } else {
+ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
+ errpos(field_name));
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
}
}
out:
@@ -3755,6 +2315,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
return alias;
}
+static struct hist_field *parse_const(struct hist_trigger_data *hist_data,
+ char *str, char *var_name,
+ unsigned long *flags)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *field = NULL;
+ u64 constant;
+
+ if (kstrtoull(str, 0, &constant)) {
+ hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str));
+ return NULL;
+ }
+
+ *flags |= HIST_FIELD_FL_CONST;
+ field = create_hist_field(hist_data, NULL, *flags, var_name);
+ if (!field)
+ return NULL;
+
+ field->constant = constant;
+
+ return field;
+}
+
static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
struct trace_event_file *file, char *str,
unsigned long *flags, char *var_name)
@@ -3762,8 +2345,18 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
struct ftrace_event_field *field = NULL;
struct hist_field *hist_field = NULL;
+ unsigned long buckets = 0;
int ret = 0;
+ if (isdigit(str[0])) {
+ hist_field = parse_const(hist_data, str, var_name, flags);
+ if (!hist_field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ return hist_field;
+ }
+
s = strchr(str, '.');
if (s) {
s = strchr(++s, '.');
@@ -3799,7 +2392,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
} else
str = s;
- field = parse_field(hist_data, file, str, flags);
+ field = parse_field(hist_data, file, str, flags, &buckets);
if (IS_ERR(field)) {
ret = PTR_ERR(field);
goto out;
@@ -3810,6 +2403,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
ret = -ENOMEM;
goto out;
}
+ hist_field->buckets = buckets;
return hist_field;
out:
@@ -3819,21 +2413,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level);
+ char *var_name, unsigned int *n_subexprs);
static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1, *expr = NULL;
unsigned long operand_flags;
int ret = 0;
char *s;
+ /* Unary minus operator, increment n_subexprs */
+ ++*n_subexprs;
+
/* we support only -(xxx) i.e. explicit parens required */
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
ret = -EINVAL;
goto free;
@@ -3850,8 +2447,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
s = strrchr(str, ')');
- if (s)
+ if (s) {
+ /* unary minus not supported in sub-expressions */
+ if (*(s+1) != '\0') {
+ hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR,
+ errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
*s = '\0';
+ }
else {
ret = -EINVAL; /* no closing ')' */
goto free;
@@ -3865,11 +2470,18 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
operand_flags = 0;
- operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand1)) {
ret = PTR_ERR(operand1);
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ /* String type can not be the operand of unary operator. */
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ destroy_hist_field(operand1, 0);
+ ret = -EINVAL;
+ goto free;
+ }
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -3877,7 +2489,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operator = FIELD_OP_UNARY_MINUS;
expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
if (!expr->type) {
ret = -ENOMEM;
goto free;
@@ -3889,9 +2501,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
return ERR_PTR(ret);
}
+/*
+ * If the operands are var refs, return pointers the
+ * variable(s) referenced in var1 and var2, else NULL.
+ */
static int check_expr_operands(struct trace_array *tr,
struct hist_field *operand1,
- struct hist_field *operand2)
+ struct hist_field *operand2,
+ struct hist_field **var1,
+ struct hist_field **var2)
{
unsigned long operand1_flags = operand1->flags;
unsigned long operand2_flags = operand2->flags;
@@ -3904,6 +2522,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand1_flags = var->flags;
+ *var1 = var;
}
if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
@@ -3914,6 +2533,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand2_flags = var->flags;
+ *var2 = var;
}
if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
@@ -3928,64 +2548,102 @@ static int check_expr_operands(struct trace_array *tr,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
- unsigned long operand_flags;
+ struct hist_field *var1 = NULL, *var2 = NULL;
+ unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
+ hist_field_fn_t op_fn;
+ bool combine_consts;
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
return ERR_PTR(-EINVAL);
}
- field_op = contains_operator(str);
+ field_op = contains_operator(str, &sep);
if (field_op == FIELD_OP_NONE)
return parse_atom(hist_data, file, str, &flags, var_name);
if (field_op == FIELD_OP_UNARY_MINUS)
- return parse_unary(hist_data, file, str, flags, var_name, ++level);
+ return parse_unary(hist_data, file, str, flags, var_name, n_subexprs);
- switch (field_op) {
- case FIELD_OP_MINUS:
- sep = "-";
- break;
- case FIELD_OP_PLUS:
- sep = "+";
- break;
- default:
- goto free;
- }
+ /* Binary operator found, increment n_subexprs */
+ ++*n_subexprs;
- operand1_str = strsep(&str, sep);
- if (!operand1_str || !str)
- goto free;
+ /* Split the expression string at the root operator */
+ if (!sep)
+ return ERR_PTR(-EINVAL);
+
+ *sep = '\0';
+ operand1_str = str;
+ str = sep+1;
+
+ /* Binary operator requires both operands */
+ if (*operand1_str == '\0' || *str == '\0')
+ return ERR_PTR(-EINVAL);
operand_flags = 0;
- operand1 = parse_atom(hist_data, file, operand1_str,
- &operand_flags, NULL);
- if (IS_ERR(operand1)) {
- ret = PTR_ERR(operand1);
- operand1 = NULL;
- goto free;
+
+ /* LHS of string is an expression e.g. a+b in a+b+c */
+ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs);
+ if (IS_ERR(operand1))
+ return ERR_CAST(operand1);
+
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
+ ret = -EINVAL;
+ goto free_op1;
}
- /* rest of string could be another expression e.g. b+c in a+b+c */
+ /* RHS of string is another expression e.g. c in a+b+c */
operand_flags = 0;
- operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand2)) {
ret = PTR_ERR(operand2);
- operand2 = NULL;
- goto free;
+ goto free_op1;
+ }
+ if (operand2->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ ret = -EINVAL;
+ goto free_operands;
+ }
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ op_fn = hist_field_minus;
+ break;
+ case FIELD_OP_PLUS:
+ op_fn = hist_field_plus;
+ break;
+ case FIELD_OP_DIV:
+ op_fn = hist_field_div;
+ break;
+ case FIELD_OP_MULT:
+ op_fn = hist_field_mult;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free_operands;
}
- ret = check_expr_operands(file->tr, operand1, operand2);
+ ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2);
if (ret)
- goto free;
+ goto free_operands;
- flags |= HIST_FIELD_FL_EXPR;
+ operand_flags = var1 ? var1->flags : operand1->flags;
+ operand2_flags = var2 ? var2->flags : operand2->flags;
+
+ /*
+ * If both operands are constant, the expression can be
+ * collapsed to a single constant.
+ */
+ combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST;
+
+ flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR;
flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -3993,40 +2651,79 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr = create_hist_field(hist_data, NULL, flags, var_name);
if (!expr) {
ret = -ENOMEM;
- goto free;
+ goto free_operands;
}
operand1->read_once = true;
operand2->read_once = true;
+ /* The operands are now owned and free'd by 'expr' */
expr->operands[0] = operand1;
expr->operands[1] = operand2;
- expr->operator = field_op;
- expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
- if (!expr->type) {
- ret = -ENOMEM;
- goto free;
+
+ if (field_op == FIELD_OP_DIV &&
+ operand2_flags & HIST_FIELD_FL_CONST) {
+ u64 divisor = var2 ? var2->constant : operand2->constant;
+
+ if (!divisor) {
+ hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str));
+ ret = -EDOM;
+ goto free_expr;
+ }
+
+ /*
+ * Copy the divisor here so we don't have to look it up
+ * later if this is a var ref
+ */
+ operand2->constant = divisor;
+ op_fn = hist_field_get_div_fn(operand2);
}
- switch (field_op) {
- case FIELD_OP_MINUS:
- expr->fn = hist_field_minus;
- break;
- case FIELD_OP_PLUS:
- expr->fn = hist_field_plus;
- break;
- default:
- ret = -EINVAL;
- goto free;
+ if (combine_consts) {
+ if (var1)
+ expr->operands[0] = var1;
+ if (var2)
+ expr->operands[1] = var2;
+
+ expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+
+ expr->operands[0] = NULL;
+ expr->operands[1] = NULL;
+
+ /*
+ * var refs won't be destroyed immediately
+ * See: destroy_hist_field()
+ */
+ destroy_hist_field(operand2, 0);
+ destroy_hist_field(operand1, 0);
+
+ expr->name = expr_str(expr, 0);
+ } else {
+ expr->fn = op_fn;
+
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+
+ expr->operator = field_op;
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free_expr;
+ }
+
+ expr->name = expr_str(expr, 0);
}
return expr;
- free:
- destroy_hist_field(operand1, 0);
+
+free_operands:
destroy_hist_field(operand2, 0);
- destroy_hist_field(expr, 0);
+free_op1:
+ destroy_hist_field(operand1, 0);
+ return ERR_PTR(ret);
+free_expr:
+ destroy_hist_field(expr, 0);
return ERR_PTR(ret);
}
@@ -4148,7 +2845,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* If a user specifies a field on an event that isn't the event the
* histogram currently being defined (the target event histogram), the
* only way that can be accomplished is if a new hist trigger is
@@ -4167,12 +2864,12 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
char *subsys_name, char *event_name, char *field_name)
{
struct trace_array *tr = target_hist_data->event_file->tr;
- struct hist_field *event_var = ERR_PTR(-EINVAL);
struct hist_trigger_data *hist_data;
unsigned int i, n, first = true;
struct field_var_hist *var_hist;
struct trace_event_file *file;
struct hist_field *key_field;
+ struct hist_field *event_var;
char *saved_filter;
char *cmd;
int ret;
@@ -4310,6 +3007,7 @@ find_target_event_var(struct hist_trigger_data *hist_data,
}
static inline void __update_field_vars(struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec,
struct field_var **field_vars,
@@ -4325,14 +3023,16 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
- var_val = val->fn(val, elt, rbe, rec);
+ var_val = val->fn(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
if (val->flags & HIST_FIELD_FL_STRING) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
+ unsigned int size;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -4341,19 +3041,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
static void update_field_vars(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec)
{
- __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars,
hist_data->n_field_vars, 0);
}
static void save_track_data_vars(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
- __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars,
hist_data->n_save_vars, hist_data->n_field_var_str);
}
@@ -4382,15 +3084,16 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
goto out;
}
+ var->ref = 1;
var->flags = HIST_FIELD_FL_VAR;
var->var.idx = idx;
var->var.hist_data = var->hist_data = hist_data;
var->size = size;
var->var.name = kstrdup(name, GFP_KERNEL);
- var->type = kstrdup(type, GFP_KERNEL);
+ var->type = kstrdup_const(type, GFP_KERNEL);
if (!var->var.name || !var->type) {
+ kfree_const(var->type);
kfree(var->var.name);
- kfree(var->type);
kfree(var);
var = ERR_PTR(-ENOMEM);
}
@@ -4528,12 +3231,14 @@ static void save_track_val(struct hist_trigger_data *hist_data,
}
static void save_track_data(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
if (data->track_data.save_data)
- data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->track_data.save_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
static bool check_track_val(struct tracing_map_elt *elt,
@@ -4584,7 +3289,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals)
@@ -4653,7 +3359,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
return false;
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals) {}
@@ -4695,7 +3402,8 @@ static void track_data_print(struct seq_file *m,
}
static void ontrack_action(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -4703,7 +3411,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data,
if (check_track_val(elt, data, var_val)) {
save_track_val(hist_data, elt, data, var_val);
- save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ save_track_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
}
@@ -5011,6 +3720,9 @@ static void destroy_field_vars(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_field_vars; i++)
destroy_field_var(hist_data->field_vars[i]);
+
+ for (i = 0; i < hist_data->n_save_vars; i++)
+ destroy_field_var(hist_data->save_vars[i]);
}
static void save_field_var(struct hist_trigger_data *hist_data,
@@ -5034,9 +3746,18 @@ static int check_synth_field(struct synth_event *event,
field = event->fields[field_pos];
+ /*
+ * A dynamic string synth field can accept static or
+ * dynamic. A static string synth field can only accept a
+ * same-sized static string, which is checked for later.
+ */
+ if (strstr(hist_field->type, "char[") && field->is_string
+ && field->is_dynamic)
+ return 0;
+
if (strcmp(field->type, hist_field->type) != 0) {
if (field->size != hist_field->size ||
- field->is_signed != hist_field->is_signed)
+ (!field->is_string && field->is_signed != hist_field->is_signed))
return -EINVAL;
}
@@ -5091,7 +3812,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
} else {
field_var = NULL;
/*
- * If no explicit system.event is specfied, default to
+ * If no explicit system.event is specified, default to
* looking for fields on the onmatch(system.event.xxx)
* event.
*/
@@ -5100,6 +3821,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
@@ -5369,9 +4092,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
unsigned long flags)
{
struct hist_field *hist_field;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
- hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
@@ -5399,6 +4122,41 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
+static const char *no_comm = "(no comm)";
+
+static u64 hist_field_execname(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+
+ if (WARN_ON_ONCE(!elt))
+ return (u64)(unsigned long)no_comm;
+
+ elt_data = elt->private_data;
+
+ if (WARN_ON_ONCE(!elt_data->comm))
+ return (u64)(unsigned long)no_comm;
+
+ return (u64)(unsigned long)(elt_data->comm);
+}
+
+/* Convert a var that points to common_pid.execname to a string */
+static void update_var_execname(struct hist_field *hist_field)
+{
+ hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR |
+ HIST_FIELD_FL_EXECNAME;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->is_signed = 0;
+
+ kfree_const(hist_field->type);
+ hist_field->type = "char[]";
+
+ hist_field->fn = hist_field_execname;
+}
+
static int create_var_field(struct hist_trigger_data *hist_data,
unsigned int val_idx,
struct trace_event_file *file,
@@ -5406,6 +4164,7 @@ static int create_var_field(struct hist_trigger_data *hist_data,
{
struct trace_array *tr = hist_data->event_file->tr;
unsigned long flags = 0;
+ int ret;
if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
return -EINVAL;
@@ -5420,7 +4179,15 @@ static int create_var_field(struct hist_trigger_data *hist_data,
if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
return -EINVAL;
- return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+ ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+
+ if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
+ update_var_execname(hist_data->fields[val_idx]);
+
+ if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
+ hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
+
+ return ret;
}
static int create_val_fields(struct hist_trigger_data *hist_data,
@@ -5468,7 +4235,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct hist_field *hist_field = NULL;
unsigned long flags = 0;
unsigned int key_size;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
return -EINVAL;
@@ -5481,7 +4248,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
hist_field = create_hist_field(hist_data, NULL, flags, NULL);
} else {
hist_field = parse_expr(hist_data, file, field_str, flags,
- NULL, 0);
+ NULL, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
@@ -5620,7 +4387,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL);
if (!s) {
- kfree(hist_data->attrs->var_defs.name[n_vars]);
ret = -ENOMEM;
goto free;
}
@@ -6131,7 +4897,8 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe,
u64 *var_ref_vals)
{
@@ -6145,9 +4912,28 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
+
+ if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ unsigned int str_start, var_str_idx, idx;
+ char *str, *val_str;
+ unsigned int size;
+
+ str_start = hist_data->n_field_var_str +
+ hist_data->n_save_var_str;
+ var_str_idx = hist_field->var_str_idx;
+ idx = str_start + var_str_idx;
+
+ str = elt_data->field_var_str[idx];
+ val_str = (char *)(uintptr_t)hist_val;
+
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+
+ hist_val = (u64)(uintptr_t)str;
+ }
tracing_map_set_var(elt, var_idx, hist_val);
continue;
}
@@ -6157,13 +4943,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
}
- update_field_vars(hist_data, elt, rbe, rec);
+ update_field_vars(hist_data, elt, buffer, rbe, rec);
}
static inline void add_to_key(char *compound_key, void *key,
@@ -6177,8 +4963,6 @@ static inline void add_to_key(char *compound_key, void *key,
field = key_field->field;
if (field->filter_type == FILTER_DYN_STRING)
size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_PTR_STRING)
- size = strlen(key);
else if (field->filter_type == FILTER_STATIC_STRING)
size = field->size;
@@ -6193,7 +4977,8 @@ static inline void add_to_key(char *compound_key, void *key,
static void
hist_trigger_actions(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
u64 *var_ref_vals)
{
@@ -6202,11 +4987,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->n_actions; i++) {
data = hist_data->actions[i];
- data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals);
}
}
-static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+static void event_hist_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -6231,7 +5017,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
HIST_STACKTRACE_SKIP);
key = entries;
} else {
- field_contents = key_field->fn(key_field, elt, rbe, rec);
+ field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -6254,17 +5040,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
if (!elt)
return;
- hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned long *stacktrace_entries,
unsigned int max_entries)
{
- char str[KSYM_SYMBOL_LEN];
unsigned int spaces = 8;
unsigned int i;
@@ -6273,8 +5058,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
- sprint_symbol(str, stacktrace_entries[i]);
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]);
}
}
@@ -6284,7 +5068,6 @@ static void hist_trigger_print_key(struct seq_file *m,
struct tracing_map_elt *elt)
{
struct hist_field *key_field;
- char str[KSYM_SYMBOL_LEN];
bool multiline = false;
const char *field_name;
unsigned int i;
@@ -6305,14 +5088,12 @@ static void hist_trigger_pri